Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support single-year season codes #611

Merged
merged 1 commit into from
Jun 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 183 additions & 46 deletions soccerdata/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from abc import ABC, abstractmethod
from datetime import date, datetime, timedelta
from enum import Enum
from pathlib import Path
from typing import IO, Callable, Dict, Iterable, List, Optional, Union

Expand All @@ -23,6 +24,178 @@
from ._config import DATA_DIR, LEAGUE_DICT, MAXAGE, logger


class SeasonCode(Enum):
    """How to interpret season codes.

    Attributes
    ----------
    SINGLE_YEAR: The season code is a single year, e.g. '2021'.
    MULTI_YEAR: The season code is a range of years, e.g. '2122'.
    """

    SINGLE_YEAR = "single-year"
    MULTI_YEAR = "multi-year"

    @staticmethod
    def from_league(league: str) -> "SeasonCode":
        """Return the default season code for a league.

        An explicit ``"season_code"`` entry in the league's configuration
        takes precedence. Otherwise the format is derived from the
        ``"season_start"`` (default ``"Aug"``) and ``"season_end"`` (default
        ``"May"``) month abbreviations: a season that ends in an earlier
        calendar month than it starts spans two years.

        Parameters
        ----------
        league : str
            The league to consider. Must be a key of ``LEAGUE_DICT``.

        Returns
        -------
        SeasonCode
            The season code format to use.

        Raises
        ------
        AssertionError
            If the league is not a key of ``LEAGUE_DICT``.
        ValueError
            If the configured ``"season_code"`` is not a valid member value
            or a month abbreviation cannot be parsed.
        """
        # NOTE(review): kept as an assert so the exception type callers may
        # rely on does not change; be aware it is stripped under ``python -O``.
        assert league in LEAGUE_DICT, f"Unknown league '{league}'"
        select_league_dict = LEAGUE_DICT[league]
        if "season_code" in select_league_dict:
            return SeasonCode(select_league_dict["season_code"])
        start_month = datetime.strptime(
            select_league_dict.get("season_start", "Aug"),
            "%b",
        ).month
        end_month = datetime.strptime(
            select_league_dict.get("season_end", "May"),
            "%b",
        ).month
        # The season wraps around New Year when it ends "before" it starts.
        return SeasonCode.MULTI_YEAR if (end_month - start_month) < 0 else SeasonCode.SINGLE_YEAR

    @staticmethod
    def from_leagues(leagues: List[str]) -> "SeasonCode":
        """Determine the season code to use for a set of leagues.

        If the given leagues have different default season codes,
        the multi-year format is used and a warning is emitted.

        Parameters
        ----------
        leagues : list of str
            The leagues to consider.

        Returns
        -------
        SeasonCode
            The season code format to use.
        """
        season_codes = {SeasonCode.from_league(league) for league in leagues}
        if len(season_codes) == 1:
            return season_codes.pop()
        warnings.warn(
            "The leagues have different default season codes. Using multi-year season codes.",
            stacklevel=2,
        )
        return SeasonCode.MULTI_YEAR

    def parse(self, season: Union[str, int]) -> str:  # noqa: C901
        """Convert a string or int to a standard season format.

        Accepted inputs are ``YYYY`` / ``YYYY`` read as ``yyzz`` (e.g. '1994'
        or '9495'), ``YY``, ``YYYY-YYYY``, ``YYYY/YYYY``, ``YYYY-YY``,
        ``YY-YY`` and ``YY/YY``.

        Parameters
        ----------
        season : str or int
            The season identifier to normalise.

        Returns
        -------
        str
            A 4-character code: the start year (e.g. '1994') for
            ``SINGLE_YEAR`` or the two 2-digit years (e.g. '9495') for
            ``MULTI_YEAR``.

        Raises
        ------
        ValueError
            If the input matches none of the supported formats.
        """
        season = str(season)
        multi_year = self is SeasonCode.MULTI_YEAR
        current_year = datetime.now().year

        def expand_year(two_digits: str) -> str:
            """Expand 'yy' to 'YYYY': 20yy unless that lies in the future, else 19yy."""
            if int("20" + two_digits) <= current_year:
                return "20" + two_digits
            return "19" + two_digits

        def process_four_digit_year(s: str) -> str:
            """Process a 4-digit string like '1994' or '9495'."""
            if multi_year:
                # Two consecutive 2-digit years (wrapping 99 -> 00) mean the
                # input already is a multi-year code such as '9495' or '9900'.
                if int(s[2:]) == (int(s[:2]) + 1) % 100:
                    if s == "2021":
                        msg = 'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format(
                            s, s[:2], s[-2:]
                        )
                        warnings.warn(msg, stacklevel=1)
                    return s
                if s[2:] == "99":
                    return "9900"
                return s[-2:] + f"{int(s[-2:]) + 1:02d}"
            # NOTE(review): '1920' maps to '1919' while '2021' maps to '2020';
            # '2019' may have been intended for the former -- confirm.
            if s == "1920":
                return "1919"
            if s == "2021":
                return "2020"
            if s[:2] in ("19", "20"):
                return s
            # Remaining inputs are 'yyzz' codes; resolve the century of the
            # start year the same way as for 2-digit input.
            return expand_year(s[:2])

        def process_two_digit_year(s: str) -> str:
            """Process a 2-digit string like '94'."""
            if multi_year:
                if s == "99":
                    return "9900"
                return s + f"{int(s) + 1:02d}"
            return expand_year(s)

        def process_full_year_range(s: str) -> str:
            """Process a range of 4-digit strings like '1994-1995'."""
            if multi_year:
                return s[2:4] + s[-2:]
            return s[:4]

        def process_partial_year_range(s: str) -> str:
            """Process a range of 4-digit and 2-digit string like '1994-95'."""
            if multi_year:
                return s[2:4] + s[-2:]
            return s[:4]

        def process_short_year_range(s: str) -> str:
            """Process a range of 2-digit strings like '94-95'."""
            if multi_year:
                return s[:2] + s[-2:]
            return expand_year(s[:2])

        # The slice-based helpers ignore the separator character, so the
        # '/' and '-' variants of a format can share one handler.
        patterns = [
            (re.compile(r"^[0-9]{4}$"), process_four_digit_year),  # 1994 | 9495
            (re.compile(r"^[0-9]{2}$"), process_two_digit_year),  # 94
            (re.compile(r"^[0-9]{4}-[0-9]{4}$"), process_full_year_range),  # 1994-1995
            (re.compile(r"^[0-9]{4}/[0-9]{4}$"), process_full_year_range),  # 1994/1995
            (re.compile(r"^[0-9]{4}-[0-9]{2}$"), process_partial_year_range),  # 1994-95
            (re.compile(r"^[0-9]{2}-[0-9]{2}$"), process_short_year_range),  # 94-95
            (re.compile(r"^[0-9]{2}/[0-9]{2}$"), process_short_year_range),  # 94/95
        ]

        for pattern, action in patterns:
            if pattern.match(season):
                return action(season)

        raise ValueError(f"Unrecognized season code: '{season}'")


class BaseReader(ABC):
"""Base class for data readers.

Expand Down Expand Up @@ -255,6 +428,10 @@ def _selected_leagues(self, ids: Optional[Union[str, List[str]]] = None) -> None
tmp_league_dict[i] = self._all_leagues()[i]
self._leagues_dict = tmp_league_dict

@property
def _season_code(self) -> SeasonCode:
    """Season code format shared by the currently selected leagues."""
    selected_leagues = self.leagues
    return SeasonCode.from_leagues(selected_leagues)

def _is_complete(self, league: str, season: str) -> bool:
"""Check if a season is complete."""
if league in LEAGUE_DICT:
Expand Down Expand Up @@ -293,7 +470,7 @@ def seasons(self, seasons: Optional[Union[str, int, Iterable[Union[str, int]]]])
seasons = [f"{y - 1}-{y}" for y in range(year, year - 6, -1)]
if isinstance(seasons, str) or isinstance(seasons, int):
seasons = [seasons]
self._season_ids = [season_code(s) for s in seasons]
self._season_ids = [self._season_code.parse(s) for s in seasons]


class BaseRequestsReader(BaseReader):
Expand Down Expand Up @@ -322,7 +499,7 @@ def __init__(

def _init_session(self) -> requests.Session:
session = cloudscraper.create_scraper(
browser={'browser': 'chrome', 'platform': 'linux', 'mobile': False}
browser={"browser": "chrome", "platform": "linux", "mobile": False}
)
session.proxies.update(self.proxy())
return session
Expand All @@ -343,7 +520,7 @@ def _download_and_save(
if isinstance(var, str):
var = [var]
var_names = "|".join(var)
template_understat = br"(%b)+[\s\t]*=[\s\t]*JSON\.parse\('(.*)'\)"
template_understat = rb"(%b)+[\s\t]*=[\s\t]*JSON\.parse\('(.*)'\)"
pattern_understat = template_understat % bytes(var_names, encoding="utf-8")
results = re.findall(pattern_understat, response.content)
data = {
Expand All @@ -359,7 +536,9 @@ def _download_and_save(
return io.BytesIO(payload)
except Exception:
logger.exception(
"Error while scraping %s. Retrying... (attempt %d of 5).", url, i + 1
"Error while scraping %s. Retrying... (attempt %d of 5).",
url,
i + 1,
)
self._session = self._init_session()
continue
Expand Down Expand Up @@ -479,48 +658,6 @@ def _download_and_save( # noqa: C901
raise ConnectionError("Could not download %s." % url)


def season_code(season: Union[str, int]) -> str:  # noqa: C901
    """Normalise a season identifier to a 4-character code such as '1718'.

    Supported inputs are '1994', '9495', '94', '1994-1995', '1994/1995',
    '1994-95', '94-95' and '94/95'. Anything else is returned unchanged
    (after conversion to ``str``).
    """
    code = str(season)

    def matches(*regexes: str) -> bool:
        """Tell whether the code matches at least one of the given patterns."""
        return any(re.match(regex, code) for regex in regexes)

    if matches(r"^[0-9]{4}$"):  # 1994 | 9495
        if int(code[2:]) == int(code[:2]) + 1:
            # Already two consecutive 2-digit years, e.g. 9495.
            if code == "2021":
                warnings.warn(
                    'Season id "{}" is ambiguous: interpreting as "{}-{}"'.format(
                        code, code[:2], code[-2:]
                    ),
                    stacklevel=1,
                )
            return code
        if code[2:] == "99":  # 1999
            return code[2:] + "00"
        end_digits = code[-2:]  # 1994
        return end_digits + f"{int(end_digits) + 1:02d}"

    if matches(r"^[0-9]{2}$"):  # 94
        if code == "99":
            return code + "00"
        return code + f"{int(code) + 1:02d}"

    if matches(
        r"^[0-9]{4}-[0-9]{4}$",  # 1994-1995
        r"^[0-9]{4}/[0-9]{4}$",  # 1994/1995
        r"^[0-9]{4}-[0-9]{2}$",  # 1994-95
    ):
        return code[2:4] + code[-2:]

    if matches(
        r"^[0-9]{2}-[0-9]{2}$",  # 94-95
        r"^[0-9]{2}/[0-9]{2}$",  # 94/95
    ):
        return code[:2] + code[-2:]

    # Unrecognised formats are passed through untouched.
    return code


def make_game_id(row: pd.Series) -> str:
"""Return a game id based on date, home and away team."""
if pd.isnull(row["date"]):
Expand Down
17 changes: 11 additions & 6 deletions soccerdata/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
from rich.logging import RichHandler

# Configuration
NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", 'False').lower() in ('true', '1', 't')
NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", 'False').lower() in ('true', '1', 't')
NOCACHE = os.environ.get("SOCCERDATA_NOCACHE", "False").lower() in ("true", "1", "t")
NOSTORE = os.environ.get("SOCCERDATA_NOSTORE", "False").lower() in ("true", "1", "t")
MAXAGE = None
if os.environ.get("SOCCERDATA_MAXAGE") is not None:
MAXAGE = int(os.environ.get("SOCCERDATA_MAXAGE", 0))
LOGLEVEL = os.environ.get('SOCCERDATA_LOGLEVEL', 'INFO').upper()
LOGLEVEL = os.environ.get("SOCCERDATA_LOGLEVEL", "INFO").upper()

# Directories
BASE_DIR = Path(os.environ.get("SOCCERDATA_DIR", Path.home() / "soccerdata"))
Expand Down Expand Up @@ -78,11 +78,14 @@
TEAMNAME_REPLACEMENTS = {}
_f_custom_teamnname_replacements = CONFIG_DIR / "teamname_replacements.json"
if _f_custom_teamnname_replacements.is_file():
with open(_f_custom_teamnname_replacements, encoding='utf8') as json_file:
with open(_f_custom_teamnname_replacements, encoding="utf8") as json_file:
for team, to_replace_list in json.load(json_file).items():
for to_replace in to_replace_list:
TEAMNAME_REPLACEMENTS[to_replace] = team
logger.info("Custom team name replacements loaded from %s.", _f_custom_teamnname_replacements)
logger.info(
"Custom team name replacements loaded from %s.",
_f_custom_teamnname_replacements,
)
else:
logger.info(
"No custom team name replacements found. You can configure these in %s.",
Expand Down Expand Up @@ -166,16 +169,18 @@
"FBref": "FIFA World Cup",
"FotMob": "INT-World Cup",
"WhoScored": "International - FIFA World Cup",
"season_code": "single-year",
},
"INT-Women's World Cup": {
"FBref": "FIFA Women's World Cup",
"FotMob": "INT-Women's World Cup",
"WhoScored": "International - FIFA Women's World Cup",
"season_code": "single-year",
},
}
_f_custom_league_dict = CONFIG_DIR / "league_dict.json"
if _f_custom_league_dict.is_file():
with open(_f_custom_league_dict, encoding='utf8') as json_file:
with open(_f_custom_league_dict, encoding="utf8") as json_file:
LEAGUE_DICT = {**LEAGUE_DICT, **json.load(json_file)}
logger.info("Custom league dict loaded from %s.", _f_custom_league_dict)
else:
Expand Down
14 changes: 10 additions & 4 deletions soccerdata/fbref.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@

from ._common import (
BaseRequestsReader,
SeasonCode,
make_game_id,
season_code,
standardize_colnames,
)
from ._config import DATA_DIR, NOCACHE, NOSTORE, TEAMNAME_REPLACEMENTS, logger
Expand Down Expand Up @@ -117,6 +117,12 @@ def _all_leagues(cls) -> Dict[str, str]:
res.update({"Big 5 European Leagues Combined": "Big 5 European Leagues Combined"})
return res

@property
def _season_code(self) -> SeasonCode:
    """Season code format for the selected leagues.

    Selecting the combined Big 5 pseudo-league always yields the
    multi-year format; otherwise the format is derived from the leagues.
    """
    selected_leagues = self.leagues
    if "Big 5 European Leagues Combined" not in selected_leagues:
        return SeasonCode.from_leagues(selected_leagues)
    return SeasonCode.MULTI_YEAR

def _is_complete(self, league: str, season: str) -> bool:
"""Check if a season is complete."""
if league == "Big 5 European Leagues Combined":
Expand Down Expand Up @@ -158,8 +164,8 @@ def read_leagues(self, split_up_big5: bool = False) -> pd.DataFrame:
.set_index("league")
.sort_index()
)
df["first_season"] = df["first_season"].apply(season_code)
df["last_season"] = df["last_season"].apply(season_code)
df["first_season"] = df["first_season"].apply(self._season_code.parse)
df["last_season"] = df["last_season"].apply(self._season_code.parse)

leagues = self.leagues
if "Big 5 European Leagues Combined" in self.leagues and split_up_big5:
Expand Down Expand Up @@ -212,7 +218,7 @@ def read_seasons(self, split_up_big5: bool = False) -> pd.DataFrame:

df = pd.concat(seasons).pipe(standardize_colnames)
df = df.rename(columns={"competition_name": "league"})
df["season"] = df["season"].apply(season_code)
df["season"] = df["season"].apply(self._season_code.parse)
# if both a 20xx and 19xx season are available, drop the 19xx season
df.drop_duplicates(subset=["league", "season"], keep="first", inplace=True)
df = df.set_index(["league", "season"]).sort_index()
Expand Down
Loading
Loading