Skip to content

Commit

Permalink
feat: support single-year season codes
Browse files Browse the repository at this point in the history
This commit adds support for single-year season codes (e.g., 2023) that are used
in leagues such as the MLS, in contrast to the multi-year season codes (e.g.,
2023/24) typically used in European football.
  • Loading branch information
probberechts committed Jun 16, 2024
1 parent 8119add commit 11573e6
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 252 deletions.
229 changes: 183 additions & 46 deletions soccerdata/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from abc import ABC, abstractmethod
from datetime import date, datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import IO, Callable, Dict, Iterable, List, Optional, Union

Expand All @@ -23,6 +24,178 @@
from ._config import DATA_DIR, LEAGUE_DICT, MAXAGE, logger


class SeasonCode(Enum):
    """How to interpret season codes.

    Attributes
    ----------
    SINGLE_YEAR: The season code is a single year, e.g. '2021'.
    MULTI_YEAR: The season code is a range of years, e.g. '2122'.
    """

    SINGLE_YEAR = "single-year"
    MULTI_YEAR = "multi-year"

    @staticmethod
    def from_league(league: str) -> "SeasonCode":
        """Return the default season code for a league.

        An explicit ``"season_code"`` entry in the league's configuration
        takes precedence. Otherwise the format is derived from the
        ``"season_start"`` / ``"season_end"`` months (defaulting to "Aug" and
        "May"): a season that ends in an earlier calendar month than it
        starts spans two years and uses the multi-year format.

        Parameters
        ----------
        league : str
            The league to consider. Must be a key of ``LEAGUE_DICT``.

        Returns
        -------
        SeasonCode
            The season code format to use.

        Raises
        ------
        AssertionError
            If the league is not defined in ``LEAGUE_DICT``.
        """
        # NOTE: kept as an assertion so the exception type seen by callers
        # does not change.
        assert league in LEAGUE_DICT, f"Unknown league '{league}'"
        league_config = LEAGUE_DICT[league]
        if "season_code" in league_config:
            return SeasonCode(league_config["season_code"])
        start_month = datetime.strptime(league_config.get("season_start", "Aug"), "%b").month
        end_month = datetime.strptime(league_config.get("season_end", "May"), "%b").month
        if end_month < start_month:
            return SeasonCode.MULTI_YEAR
        return SeasonCode.SINGLE_YEAR

    @staticmethod
    def from_leagues(leagues: List[str]) -> "SeasonCode":
        """Determine the season code to use for a set of leagues.

        If the given leagues have different default season codes,
        the multi-year format is used and a warning is emitted.

        Parameters
        ----------
        leagues : list of str
            The leagues to consider.

        Returns
        -------
        SeasonCode
            The season code format to use.
        """
        season_codes = {SeasonCode.from_league(league) for league in leagues}
        if len(season_codes) == 1:
            return season_codes.pop()
        warnings.warn(
            "The leagues have different default season codes. Using multi-year season codes.",
            stacklevel=2,
        )
        return SeasonCode.MULTI_YEAR

    def parse(self, season: Union[str, int]) -> str:  # noqa: C901
        """Convert a string or int to a standard season format.

        Accepted inputs are a 4-digit year or range code ('1994', '9495'),
        a 2-digit year ('94'), and year ranges separated by '-' or '/'
        ('1994-1995', '1994-95', '94-95' and their '/' variants).

        In multi-year mode the result is a 4-character range code such as
        '9495'. In single-year mode the result is the 4-digit start year
        such as '1994'; 2-digit years are expanded to 20xx unless that lies
        in the future, in which case 19xx is used. The ambiguous inputs
        '1920' and '2021' are always read as the ranges 19-20 and 20-21.

        Parameters
        ----------
        season : str or int
            The season identifier to normalize.

        Returns
        -------
        str
            The normalized season code.

        Raises
        ------
        ValueError
            If the season does not match any of the supported formats.
        """
        season = str(season)
        # "/" and "-" are interchangeable range separators; normalizing once
        # also makes the '1994/95' form parse like '1994-95'.
        normalized = season.replace("/", "-")
        multi_year = self is SeasonCode.MULTI_YEAR
        current_year = datetime.now().year

        def next_two_digits(year: str) -> str:
            """Return the 2-digit year following a 2-digit year (99 -> 00)."""
            return f"{(int(year) + 1) % 100:02d}"

        def full_year(year: str) -> str:
            """Expand a 2-digit year to 20xx, or 19xx if 20xx is in the future."""
            if int("20" + year) <= current_year:
                return "20" + year
            return "19" + year

        def process_four_digit_year(code: str) -> str:
            """Process a 4-digit string like '1994' or '9495'."""
            start, end = code[:2], code[2:]
            if multi_year:
                # Two consecutive 2-digit years form a range code already.
                # The modulo keeps '9900' (1999-2000) stable instead of
                # re-reading it as a year ending in '00'.
                if end == next_two_digits(start):
                    if code == "2021":
                        msg = 'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format(
                            code, start, end
                        )
                        warnings.warn(msg, stacklevel=1)
                    return code
                # Otherwise it is a plain year: '1994' -> '9495', '1999' -> '9900'.
                return end + next_two_digits(end)
            if code == "1920":
                return "1919"
            if code == "2021":
                return "2020"
            if start in ("19", "20"):
                return code
            # A range code such as '2122': expand its start year the same
            # way as the 2-digit and short-range forms do.
            return full_year(start)

        def process_two_digit_year(code: str) -> str:
            """Process a 2-digit string like '94'."""
            if multi_year:
                return code + next_two_digits(code)
            return full_year(code)

        def process_long_year_range(code: str) -> str:
            """Process a range starting with a 4-digit year like '1994-1995' or '1994-95'."""
            if multi_year:
                return code[2:4] + code[-2:]
            return code[:4]

        def process_short_year_range(code: str) -> str:
            """Process a range of 2-digit strings like '94-95'."""
            if multi_year:
                return code[:2] + code[-2:]
            return full_year(code[:2])

        patterns = [
            (r"^[0-9]{4}$", process_four_digit_year),  # 1994 | 9495
            (r"^[0-9]{2}$", process_two_digit_year),  # 94
            (r"^[0-9]{4}-[0-9]{4}$", process_long_year_range),  # 1994-1995 | 1994/1995
            (r"^[0-9]{4}-[0-9]{2}$", process_long_year_range),  # 1994-95 | 1994/95
            (r"^[0-9]{2}-[0-9]{2}$", process_short_year_range),  # 94-95 | 94/95
        ]
        for pattern, action in patterns:
            if re.match(pattern, normalized):
                return action(normalized)

        raise ValueError(f"Unrecognized season code: '{season}'")


class BaseReader(ABC):
"""Base class for data readers.
Expand Down Expand Up @@ -255,6 +428,10 @@ def _selected_leagues(self, ids: Optional[Union[str, List[str]]] = None) -> None
tmp_league_dict[i] = self._all_leagues()[i]
self._leagues_dict = tmp_league_dict

@property
def _season_code(self) -> SeasonCode:
    """Return the season code format for this reader's selected leagues.

    The format is determined by ``SeasonCode.from_leagues`` applied to
    ``self.leagues``.
    """
    return SeasonCode.from_leagues(self.leagues)

def _is_complete(self, league: str, season: str) -> bool:
"""Check if a season is complete."""
if league in LEAGUE_DICT:
Expand Down Expand Up @@ -293,7 +470,7 @@ def seasons(self, seasons: Optional[Union[str, int, Iterable[Union[str, int]]]])
seasons = [f"{y - 1}-{y}" for y in range(year, year - 6, -1)]
if isinstance(seasons, str) or isinstance(seasons, int):
seasons = [seasons]
self._season_ids = [season_code(s) for s in seasons]
self._season_ids = [self._season_code.parse(s) for s in seasons]


class BaseRequestsReader(BaseReader):
Expand Down Expand Up @@ -322,7 +499,7 @@ def __init__(

def _init_session(self) -> requests.Session:
session = cloudscraper.create_scraper(
browser={'browser': 'chrome', 'platform': 'linux', 'mobile': False}
browser={"browser": "chrome", "platform": "linux", "mobile": False}
)
session.proxies.update(self.proxy())
return session
Expand All @@ -343,7 +520,7 @@ def _download_and_save(
if isinstance(var, str):
var = [var]
var_names = "|".join(var)
template_understat = br"(%b)+[\s\t]*=[\s\t]*JSON\.parse\('(.*)'\)"
template_understat = rb"(%b)+[\s\t]*=[\s\t]*JSON\.parse\('(.*)'\)"
pattern_understat = template_understat % bytes(var_names, encoding="utf-8")
results = re.findall(pattern_understat, response.content)
data = {
Expand All @@ -359,7 +536,9 @@ def _download_and_save(
return io.BytesIO(payload)
except Exception:
logger.exception(
"Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1
"Error while scraping %s. Retrying... (attempt %d of 5).",
url,
i + 1,
)
self._session = self._init_session()
continue
Expand Down Expand Up @@ -479,48 +658,6 @@ def _download_and_save( # noqa: C901
raise ConnectionError("Could not download %s." % url)


def season_code(season: Union[str, int]) -> str:  # noqa: C901
    """Convert a string or int to a season code like '1718'.

    Supported inputs are a 4-digit year or range code ('1994', '9495'), a
    2-digit year ('94') and the ranges '1994-1995', '1994/1995', '1994-95',
    '94-95' and '94/95'. Any other input is returned unchanged.

    Parameters
    ----------
    season : str or int
        The season identifier to normalize.

    Returns
    -------
    str
        The 4-character season code, or the input as a string if its
        format is not recognized.
    """
    season = str(season)

    def next_two_digits(year: str) -> str:
        """Return the 2-digit year following a 2-digit year (99 -> 00)."""
        return f"{(int(year) + 1) % 100:02d}"

    if re.match(r"^[0-9]{4}$", season):  # 1994 | 9495
        start, end = season[:2], season[2:]
        # Two consecutive 2-digit years already form a season code. The
        # wrap-around keeps '9900' (1999-2000) stable instead of turning
        # it into '0001'.
        if end == next_two_digits(start):
            if season == "2021":
                msg = 'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format(
                    season, start, end
                )
                warnings.warn(msg, stacklevel=1)
            return season  # 9495
        return end + next_two_digits(end)  # 1994 -> 9495, 1999 -> 9900
    if re.match(r"^[0-9]{2}$", season):  # 94
        return season + next_two_digits(season)
    if (
        re.match(r"^[0-9]{4}-[0-9]{4}$", season)  # 1994-1995
        or re.match(r"^[0-9]{4}/[0-9]{4}$", season)  # 1994/1995
        or re.match(r"^[0-9]{4}-[0-9]{2}$", season)  # 1994-95
    ):
        return season[2:4] + season[-2:]
    if re.match(r"^[0-9]{2}-[0-9]{2}$", season) or re.match(  # 94-95
        r"^[0-9]{2}/[0-9]{2}$", season  # 94/95
    ):
        return season[:2] + season[-2:]
    # Unrecognized formats are passed through untouched.
    return season


def make_game_id(row: pd.Series) -> str:
"""Return a game id based on date, home and away team."""
if pd.isnull(row["date"]):
Expand Down
17 changes: 11 additions & 6 deletions soccerdata/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
from rich.logging import RichHandler

# Configuration
NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", 'False').lower() in ('true', '1', 't')
NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", 'False').lower() in ('true', '1', 't')
NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", "False").lower() in ("true", "1", "t")
NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", "False").lower() in ("true", "1", "t")
MAXAGE = None
if os.environ.get("SOCCERDATA_MAXAGE") is not None:
MAXAGE = int(os.environ.get("SOCCERDATA_MAXAGE", 0))
LOGLEVEL = os.environ.get('SOCCERDATA_LOGLEVEL', 'INFO').upper()
LOGLEVEL = os.environ.get("SOCCERDATA_LOGLEVEL", "INFO").upper()

# Directories
BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata"))
Expand Down Expand Up @@ -78,11 +78,14 @@
TEAMNAME_REPLACEMENTS = {}
_f_custom_teamnname_replacements = CONFIG_DIR / "teamname_replacements.json"
if _f_custom_teamnname_replacements.is_file():
with open(_f_custom_teamnname_replacements, encoding='utf8') as json_file:
with open(_f_custom_teamnname_replacements, encoding="utf8") as json_file:
for team, to_replace_list in json.load(json_file).items():
for to_replace in to_replace_list:
TEAMNAME_REPLACEMENTS[to_replace] = team
logger.info("Custom team name replacements loaded from %s.", _f_custom_teamnname_replacements)
logger.info(
"Custom team name replacements loaded from %s.",
_f_custom_teamnname_replacements,
)
else:
logger.info(
"No custom team name replacements found. You can configure these in %s.",
Expand Down Expand Up @@ -166,16 +169,18 @@
"FBref": "FIFA World Cup",
"FotMob": "INT-World Cup",
"WhoScored": "International - FIFA World Cup",
"season_code": "single-year",
},
"INT-Women's World Cup": {
"FBref": "FIFA Women's World Cup",
"FotMob": "INT-Women's World Cup",
"WhoScored": "International - FIFA Women's World Cup",
"season_code": "single-year",
},
}
_f_custom_league_dict = CONFIG_DIR / "league_dict.json"
if _f_custom_league_dict.is_file():
with open(_f_custom_league_dict, encoding='utf8') as json_file:
with open(_f_custom_league_dict, encoding="utf8") as json_file:
LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)}
logger.info("Custom league dict loaded from %s.", _f_custom_league_dict)
else:
Expand Down
14 changes: 10 additions & 4 deletions soccerdata/fbref.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from ._common import (
BaseRequestsReader,
SeasonCode,
make_game_id,
season_code,
standardize_colnames,
)
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger
Expand Down Expand Up @@ -117,6 +117,12 @@ def _all_leagues(cls) -> Dict[str, str]:
res.update({"Big 5 European Leagues Combined": "Big 5 European Leagues Combined"})
return res

@property
def _season_code(self) -> SeasonCode:
    """Return the season code format for this reader's selected leagues.

    The multi-year format is always used when "Big 5 European Leagues
    Combined" is among the selected leagues; otherwise the format is
    determined by ``SeasonCode.from_leagues``.
    """
    # NOTE(review): the combined league is presumably not a LEAGUE_DICT
    # entry, so it is handled here rather than passed to from_leagues -
    # confirm.
    if "Big 5 European Leagues Combined" in self.leagues:
        return SeasonCode.MULTI_YEAR
    return SeasonCode.from_leagues(self.leagues)

def _is_complete(self, league: str, season: str) -> bool:
"""Check if a season is complete."""
if league == "Big 5 European Leagues Combined":
Expand Down Expand Up @@ -158,8 +164,8 @@ def read_leagues(self, split_up_big5: bool = False) -> pd.DataFrame:
.set_index("league")
.sort_index()
)
df["first_season"] = df["first_season"].apply(season_code)
df["last_season"] = df["last_season"].apply(season_code)
df["first_season"] = df["first_season"].apply(self._season_code.parse)
df["last_season"] = df["last_season"].apply(self._season_code.parse)

leagues = self.leagues
if "Big 5 European Leagues Combined" in self.leagues and split_up_big5:
Expand Down Expand Up @@ -212,7 +218,7 @@ def read_seasons(self, split_up_big5: bool = False) -> pd.DataFrame:

df = pd.concat(seasons).pipe(standardize_colnames)
df = df.rename(columns={"competition_name": "league"})
df["season"] = df["season"].apply(season_code)
df["season"] = df["season"].apply(self._season_code.parse)
# if both a 20xx and 19xx season are available, drop the 19xx season
df.drop_duplicates(subset=["league", "season"], keep="first", inplace=True)
df = df.set_index(["league", "season"]).sort_index()
Expand Down
Loading

0 comments on commit 11573e6

Please sign in to comment.