Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

versions_update #179

Merged
merged 8 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.1.1
current_version = 1.2.0
commit = True
tag = True

Expand Down
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
Changelog
=========

[1.2.0] - 2023-12-11
---------------------
### Added
- A custom function, `convert_to_datetime`, that replaces direct calls to the native `pd.to_datetime`. It handles various date formats, in particular values that contain only a year (as an integer or a 4-digit string). This is needed for compatibility with pandas >= 2.0, which deprecated the `infer_datetime_format` argument.

[1.2.0] - 2023-06-27
### Changed
- Upgraded the versions of various dependencies in `poetry.lock`, including 'anyio', 'astroid', 'asttokens', 'bumpversion', 'pandas' etc.
- Minor code changes to improve structure and readability. This includes matching literal characters in `str.replace` as plain strings (`regex=False`) instead of as regular expressions, and reordering some assignments.


[1.1.1] - 2023-06-27
--------------------

- Added a new feature: `imf_weo` module in `import_tools` with an object
Expand Down
2 changes: 1 addition & 1 deletion bblocks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.1.1"
__version__ = "1.2.0"

# Easy access to importers
from bblocks.import_tools.world_bank import WorldBankData
Expand Down
14 changes: 7 additions & 7 deletions bblocks/analysis_tools/get.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import datetime
from operator import xor

import pandas as pd

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.logger import logger


Expand Down Expand Up @@ -43,11 +45,11 @@ def __validate_cols(
if col not in d.columns:
raise ValueError(f"{col} not found in _data")

if not pd.api.types.is_datetime64_any_dtype(sdate):
sdate = pd.to_datetime(sdate, infer_datetime_format=True)
if not isinstance(sdate, datetime.datetime):
sdate = convert_to_datetime(sdate)

if not pd.api.types.is_datetime64_any_dtype(edate):
edate = pd.to_datetime(edate, infer_datetime_format=True)
if not isinstance(edate, datetime.datetime):
edate = convert_to_datetime(edate)

return sdate, edate, date_col, value_col, grouper

Expand Down Expand Up @@ -81,9 +83,7 @@ def period_avg(

# Check that date column is date and if not convert it
if not pd.api.types.is_datetime64_any_dtype(data[date_column]):
data[date_column] = pd.to_datetime(
data[date_column], infer_datetime_format=True
)
data[date_column] = convert_to_datetime(data[date_column])
logger.info(f"Converted {date_column} to datetime")

# Validate args
Expand Down
37 changes: 34 additions & 3 deletions bblocks/cleaning_tools/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Type

import country_converter as coco
import numpy as np
import pandas as pd
from numpy import nan

Expand Down Expand Up @@ -90,15 +91,15 @@

if pd.api.types.is_numeric_dtype(series):
try:
return pd.to_datetime(series, format="%Y")
return convert_to_datetime(series)

except ValueError:
raise ValueError(
f"could not parse date format in. "
f"To fix, convert column to datetime"
)
if date_format is None:
return pd.to_datetime(series, infer_datetime_format=True)
return convert_to_datetime(series)
else:
return pd.to_datetime(series, format=date_format)

Expand Down Expand Up @@ -162,7 +163,7 @@

if not pd.api.types.is_datetime64_any_dtype(series):
try:
series = pd.to_datetime(series, infer_datetime_format=True)
series = convert_to_datetime(series)
except ValueError:
raise ValueError(
f"could not parse date format in. "
Expand Down Expand Up @@ -240,3 +241,33 @@
return series.map(formats["as_billions"].format)

return series.map(f"{other_format}".format)


def convert_to_datetime(date: str | int | pd.Series) -> pd.Series | pd.Timestamp:
"""
Custom function to convert values to datetime.
It handles integers or strings that represent only a year.
"""

if isinstance(date, pd.Series):
# Find the first non-null element in the series to determine format
first_valid_index = date.first_valid_index()
if first_valid_index is None:
return date.apply(lambda x: pd.NaT)

Check warning on line 256 in bblocks/cleaning_tools/clean.py

View check run for this annotation

Codecov / codecov/patch

bblocks/cleaning_tools/clean.py#L256

Added line #L256 was not covered by tests

# Get the first valid value
format_value = date[first_valid_index]
else:
format_value = date

# Determine if the value is a year (integer or 4-digit string)
if isinstance(format_value, (np.integer, int)) or (
isinstance(format_value, str)
and len(format_value) == 4
and format_value.isdigit()
):
format_str = "%Y"
else:
format_str = None # Let pd.to_datetime infer the format

return pd.to_datetime(date, errors="coerce", format=format_str)
7 changes: 2 additions & 5 deletions bblocks/dataframe_tools/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from bblocks.cleaning_tools.clean import convert_id
from bblocks.cleaning_tools.clean import convert_id, convert_to_datetime
from bblocks.dataframe_tools.common import (
get_population_df,
get_poverty_ratio_df,
Expand Down Expand Up @@ -37,7 +37,6 @@ def __validate_add_column_params(
df["id_"] = convert_id(df[id_column], id_type)

if date_column is not None:

if pd.api.types.is_numeric_dtype(df[date_column]):
try:
df["merge_year"] = pd.to_datetime(df[date_column], format="%Y").dt.year
Expand All @@ -48,9 +47,7 @@ def __validate_add_column_params(
)
else:
try:
df["merge_year"] = pd.to_datetime(
df[date_column], infer_datetime_format=True
).dt.year
df["merge_year"] = convert_to_datetime(df[date_column]).dt.year

except ValueError:
raise ValueError(
Expand Down
4 changes: 2 additions & 2 deletions bblocks/dataframe_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.cleaning_tools.filter import filter_latest_by
from bblocks.import_tools.imf import WorldEconomicOutlook
from bblocks.import_tools.world_bank import WorldBankData
Expand Down Expand Up @@ -47,7 +48,6 @@ def _get_weo_indicator(
update: bool,
include_estimates: bool = True,
) -> pd.DataFrame:

# Create a World Economic Outlook object
weo = WorldEconomicOutlook().load_data(indicator=indicator)

Expand All @@ -57,7 +57,7 @@ def _get_weo_indicator(

# Get the _data
data = weo.get_data(keep_metadata=True).assign(
year=lambda d: pd.to_datetime(d.year, format="%Y")
year=lambda d: convert_to_datetime(d.year)
)

# Filter the _data to keep only non-estimates if needed
Expand Down
1 change: 0 additions & 1 deletion bblocks/import_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ def get_data(self, indicators: str | list = "all", **kwargs) -> pd.DataFrame:
indicators_ = self._data.values()

if isinstance(indicators, list):

for _ in indicators:
if _ not in self._data:
logger.warning(f"{_} not loaded or is an invalid indicator.")
Expand Down
3 changes: 2 additions & 1 deletion bblocks/import_tools/debt/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
from numpy import nan

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.config import BBPaths


Expand Down Expand Up @@ -56,7 +57,7 @@ def __clean_dsa(df: pd.DataFrame) -> pd.DataFrame:
)
.dropna(subset=["country"])
.replace({"…": nan, "": nan})
.assign(latest_publication=lambda d: pd.to_datetime(d.latest_publication))
.assign(latest_publication=lambda d: convert_to_datetime(d.latest_publication))
.reset_index(drop=True)
)

Expand Down
3 changes: 0 additions & 3 deletions bblocks/import_tools/ilo.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,12 +195,10 @@ def load_data(self, indicator: str | list) -> ImportData:
indicator = [indicator]

for ind in indicator: # loop through indicators

path = BBPaths.raw_data / f"{ind}.csv"

# download data if not saved to disk
if not path.exists():

# download glossaries if not loaded to object
if self._glossaries is None:
self._load_glossaries()
Expand Down Expand Up @@ -245,7 +243,6 @@ def update_data(self, reload_data: bool = True) -> ImportData:
self._load_area_dict()

for ind in self._data: # loop through loaded indicators

# download data
download_data(
ind, BBPaths.raw_data / f"{ind}.csv", self._glossaries, self._area_dict
Expand Down
6 changes: 2 additions & 4 deletions bblocks/import_tools/imf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from weo import all_releases, download, WEO

from bblocks import config
from bblocks.cleaning_tools.clean import clean_numeric_series
from bblocks.cleaning_tools.clean import clean_numeric_series, convert_to_datetime
from bblocks.import_tools.common import ImportData
from bblocks.logger import logger

Expand Down Expand Up @@ -127,15 +127,14 @@ def __load_data(
.rename(columns=names)
.melt(id_vars=names.values(), var_name="year", value_name="value")
.assign(
year=lambda d: pd.to_datetime(d.year, format="%Y"),
year=lambda d: convert_to_datetime(d.year),
value=lambda d: clean_numeric_series(d.value),
)
.dropna(subset=["value"])
.reset_index(drop=True)
)

def _check_indicators(self, indicators: str | list | None = None) -> None | dict:

if self._raw_data is None:
self.__load_data()

Expand Down Expand Up @@ -221,7 +220,6 @@ def available_indicators(self) -> None:
def get_data(
self, indicators: str | list = "all", keep_metadata: bool = False
) -> pd.DataFrame:

df = super().get_data(indicators=indicators)

if not keep_metadata:
Expand Down
4 changes: 2 additions & 2 deletions bblocks/import_tools/sdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from datetime import datetime
import calendar

from bblocks.cleaning_tools.clean import clean_numeric_series
from bblocks.cleaning_tools.clean import clean_numeric_series, convert_to_datetime
from bblocks.import_tools.common import ImportData
from bblocks.config import BBPaths
from bblocks.logger import logger
Expand Down Expand Up @@ -74,7 +74,7 @@ def clean_df(df: pd.DataFrame, date: str) -> pd.DataFrame:
.rename(columns={"variable": "indicator"})
.reset_index(drop=True)
.assign(date=date)
.assign(date=lambda d: pd.to_datetime(d.date))
.assign(date=lambda d: convert_to_datetime(d.date))
)


Expand Down
27 changes: 14 additions & 13 deletions bblocks/import_tools/world_bank.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import wbgapi as wb

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.config import BBPaths
from bblocks.import_tools.common import ImportData

Expand Down Expand Up @@ -55,9 +56,7 @@ def _get_wb_data(
f"{indicator}:T": "date",
}
)
.assign(
indicator_code=indicator, date=lambda d: pd.to_datetime(d.date, format="%Y")
)
.assign(indicator_code=indicator, date=lambda d: convert_to_datetime(d.date))
.sort_values(by=["iso_code", "date"])
.reset_index(drop=True)
.filter(["date", "iso_code", "indicator", "indicator_code", "value"], axis=1)
Expand Down Expand Up @@ -173,8 +172,8 @@ def clean_prices(df: pd.DataFrame) -> pd.DataFrame:
df.columns = df.iloc[3]
unit_dict = (
df.iloc[4]
.str.replace("(", "", regex=True)
.str.replace(")", "", regex=True)
.str.replace("(", "", regex=False)
.str.replace(")", "", regex=False)
.dropna()
.to_dict()
)
Expand All @@ -185,14 +184,16 @@ def clean_prices(df: pd.DataFrame) -> pd.DataFrame:
.replace("..", np.nan)
.reset_index(drop=True)
.melt(id_vars="period", var_name="indicator", value_name="value")
.assign(
units=lambda d: d.indicator.map(unit_dict),
period=lambda d: pd.to_datetime(d.period, format="%YM%m"),
indicator=lambda d: d.indicator.str.replace(
"*", "", regex=True
).str.strip(),
value=lambda d: pd.to_numeric(d.value, errors="coerce"),
)
)

df = df.assign(
units=lambda d: d.indicator.map(unit_dict),
period=lambda d: pd.to_datetime(d.period, format="%YM%m"),
)

df = df.assign(
indicator=lambda d: d.indicator.str.replace("*", "", regex=False).str.strip(),
value=lambda d: pd.to_numeric(d.value, errors="coerce"),
)

return df
Expand Down
1 change: 0 additions & 1 deletion bblocks/other_tools/dictionaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def update_dictionaries() -> None:


def g20_countries() -> dict:

return Dict(
{
x: convert(x, src="ISO3", to="name_short", not_found=None)
Expand Down
Loading
Loading