Skip to content

Commit

Permalink
Replace native pd.to_datetime usage with convert_to_datetime
Browse files Browse the repository at this point in the history
This commit replaces instances of `pd.to_datetime` with a custom function, `convert_to_datetime`. The function correctly handles different date formats, particularly when the date information represents only a year. Additional changes include passing `regex=False` to `str.replace` where literal characters are being removed, and reordering some assignments for a cleaner code structure.
  • Loading branch information
jm-rivera committed Dec 11, 2023
1 parent 3234ae4 commit a70bc94
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 37 deletions.
14 changes: 7 additions & 7 deletions bblocks/analysis_tools/get.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import datetime
from operator import xor

import pandas as pd

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.logger import logger


Expand Down Expand Up @@ -43,11 +45,11 @@ def __validate_cols(
if col not in d.columns:
raise ValueError(f"{col} not found in _data")

if not pd.api.types.is_datetime64_any_dtype(sdate):
sdate = pd.to_datetime(sdate, infer_datetime_format=True)
if not isinstance(sdate, datetime.datetime):
sdate = convert_to_datetime(sdate)

if not pd.api.types.is_datetime64_any_dtype(edate):
edate = pd.to_datetime(edate, infer_datetime_format=True)
if not isinstance(edate, datetime.datetime):
edate = convert_to_datetime(edate)

return sdate, edate, date_col, value_col, grouper

Expand Down Expand Up @@ -81,9 +83,7 @@ def period_avg(

# Check that date column is date and if not convert it
if not pd.api.types.is_datetime64_any_dtype(data[date_column]):
data[date_column] = pd.to_datetime(
data[date_column], infer_datetime_format=True
)
data[date_column] = convert_to_datetime(data[date_column])
logger.info(f"Converted {date_column} to datetime")

# Validate args
Expand Down
37 changes: 34 additions & 3 deletions bblocks/cleaning_tools/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Type

import country_converter as coco
import numpy as np
import pandas as pd
from numpy import nan

Expand Down Expand Up @@ -90,15 +91,15 @@ def to_date_column(series: pd.Series, date_format: str | None = None) -> pd.Seri

if pd.api.types.is_numeric_dtype(series):
try:
return pd.to_datetime(series, format="%Y")
return convert_to_datetime(series)

except ValueError:
raise ValueError(
f"could not parse date format in. "
f"To fix, convert column to datetime"
)
if date_format is None:
return pd.to_datetime(series, infer_datetime_format=True)
return convert_to_datetime(series)
else:
return pd.to_datetime(series, format=date_format)

Expand Down Expand Up @@ -162,7 +163,7 @@ def date_to_str(series: pd.Series, date_format: str = "%d %B %Y") -> pd.Series:

if not pd.api.types.is_datetime64_any_dtype(series):
try:
series = pd.to_datetime(series, infer_datetime_format=True)
series = convert_to_datetime(series)
except ValueError:
raise ValueError(
f"could not parse date format in. "
Expand Down Expand Up @@ -240,3 +241,33 @@ def format_number(
return series.map(formats["as_billions"].format)

return series.map(f"{other_format}".format)


def convert_to_datetime(date: str | int | pd.Series) -> pd.Series | pd.Timestamp:
"""
Custom function to convert values to datetime.
It handles integers or strings that represent only a year.
"""

if isinstance(date, pd.Series):
# Find the first non-null element in the series to determine format
first_valid_index = date.first_valid_index()
if first_valid_index is None:
return date.apply(lambda x: pd.NaT)

# Get the first valid value
format_value = date[first_valid_index]
else:
format_value = date

# Determine if the value is a year (integer or 4-digit string)
if isinstance(format_value, (np.integer, int)) or (
isinstance(format_value, str)
and len(format_value) == 4
and format_value.isdigit()
):
format_str = "%Y"
else:
format_str = None # Let pd.to_datetime infer the format

return pd.to_datetime(date, errors="coerce", format=format_str)
7 changes: 2 additions & 5 deletions bblocks/dataframe_tools/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pandas as pd

from bblocks.cleaning_tools.clean import convert_id
from bblocks.cleaning_tools.clean import convert_id, convert_to_datetime
from bblocks.dataframe_tools.common import (
get_population_df,
get_poverty_ratio_df,
Expand Down Expand Up @@ -37,7 +37,6 @@ def __validate_add_column_params(
df["id_"] = convert_id(df[id_column], id_type)

if date_column is not None:

if pd.api.types.is_numeric_dtype(df[date_column]):
try:
df["merge_year"] = pd.to_datetime(df[date_column], format="%Y").dt.year
Expand All @@ -48,9 +47,7 @@ def __validate_add_column_params(
)
else:
try:
df["merge_year"] = pd.to_datetime(
df[date_column], infer_datetime_format=True
).dt.year
df["merge_year"] = convert_to_datetime(df[date_column]).dt.year

except ValueError:
raise ValueError(
Expand Down
4 changes: 2 additions & 2 deletions bblocks/dataframe_tools/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import pandas as pd

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.cleaning_tools.filter import filter_latest_by
from bblocks.import_tools.imf import WorldEconomicOutlook
from bblocks.import_tools.world_bank import WorldBankData
Expand Down Expand Up @@ -47,7 +48,6 @@ def _get_weo_indicator(
update: bool,
include_estimates: bool = True,
) -> pd.DataFrame:

# Create a World Economic Outlook object
weo = WorldEconomicOutlook().load_data(indicator=indicator)

Expand All @@ -57,7 +57,7 @@ def _get_weo_indicator(

# Get the _data
data = weo.get_data(keep_metadata=True).assign(
year=lambda d: pd.to_datetime(d.year, format="%Y")
year=lambda d: convert_to_datetime(d.year)
)

# Filter the _data to keep only non-estimates if needed
Expand Down
3 changes: 2 additions & 1 deletion bblocks/import_tools/debt/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import requests
from numpy import nan

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.config import BBPaths


Expand Down Expand Up @@ -56,7 +57,7 @@ def __clean_dsa(df: pd.DataFrame) -> pd.DataFrame:
)
.dropna(subset=["country"])
.replace({"…": nan, "": nan})
.assign(latest_publication=lambda d: pd.to_datetime(d.latest_publication))
.assign(latest_publication=lambda d: convert_to_datetime(d.latest_publication))
.reset_index(drop=True)
)

Expand Down
6 changes: 2 additions & 4 deletions bblocks/import_tools/imf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from weo import all_releases, download, WEO

from bblocks import config
from bblocks.cleaning_tools.clean import clean_numeric_series
from bblocks.cleaning_tools.clean import clean_numeric_series, convert_to_datetime
from bblocks.import_tools.common import ImportData
from bblocks.logger import logger

Expand Down Expand Up @@ -127,15 +127,14 @@ def __load_data(
.rename(columns=names)
.melt(id_vars=names.values(), var_name="year", value_name="value")
.assign(
year=lambda d: pd.to_datetime(d.year, format="%Y"),
year=lambda d: convert_to_datetime(d.year),
value=lambda d: clean_numeric_series(d.value),
)
.dropna(subset=["value"])
.reset_index(drop=True)
)

def _check_indicators(self, indicators: str | list | None = None) -> None | dict:

if self._raw_data is None:
self.__load_data()

Expand Down Expand Up @@ -221,7 +220,6 @@ def available_indicators(self) -> None:
def get_data(
self, indicators: str | list = "all", keep_metadata: bool = False
) -> pd.DataFrame:

df = super().get_data(indicators=indicators)

if not keep_metadata:
Expand Down
4 changes: 2 additions & 2 deletions bblocks/import_tools/sdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from datetime import datetime
import calendar

from bblocks.cleaning_tools.clean import clean_numeric_series
from bblocks.cleaning_tools.clean import clean_numeric_series, convert_to_datetime
from bblocks.import_tools.common import ImportData
from bblocks.config import BBPaths
from bblocks.logger import logger
Expand Down Expand Up @@ -74,7 +74,7 @@ def clean_df(df: pd.DataFrame, date: str) -> pd.DataFrame:
.rename(columns={"variable": "indicator"})
.reset_index(drop=True)
.assign(date=date)
.assign(date=lambda d: pd.to_datetime(d.date))
.assign(date=lambda d: convert_to_datetime(d.date))
)


Expand Down
27 changes: 14 additions & 13 deletions bblocks/import_tools/world_bank.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import wbgapi as wb

from bblocks.cleaning_tools.clean import convert_to_datetime
from bblocks.config import BBPaths
from bblocks.import_tools.common import ImportData

Expand Down Expand Up @@ -55,9 +56,7 @@ def _get_wb_data(
f"{indicator}:T": "date",
}
)
.assign(
indicator_code=indicator, date=lambda d: pd.to_datetime(d.date, format="%Y")
)
.assign(indicator_code=indicator, date=lambda d: convert_to_datetime(d.date))
.sort_values(by=["iso_code", "date"])
.reset_index(drop=True)
.filter(["date", "iso_code", "indicator", "indicator_code", "value"], axis=1)
Expand Down Expand Up @@ -173,8 +172,8 @@ def clean_prices(df: pd.DataFrame) -> pd.DataFrame:
df.columns = df.iloc[3]
unit_dict = (
df.iloc[4]
.str.replace("(", "", regex=True)
.str.replace(")", "", regex=True)
.str.replace("(", "", regex=False)
.str.replace(")", "", regex=False)
.dropna()
.to_dict()
)
Expand All @@ -185,14 +184,16 @@ def clean_prices(df: pd.DataFrame) -> pd.DataFrame:
.replace("..", np.nan)
.reset_index(drop=True)
.melt(id_vars="period", var_name="indicator", value_name="value")
.assign(
units=lambda d: d.indicator.map(unit_dict),
period=lambda d: pd.to_datetime(d.period, format="%YM%m"),
indicator=lambda d: d.indicator.str.replace(
"*", "", regex=True
).str.strip(),
value=lambda d: pd.to_numeric(d.value, errors="coerce"),
)
)

df = df.assign(
units=lambda d: d.indicator.map(unit_dict),
period=lambda d: pd.to_datetime(d.period, format="%YM%m"),
)

df = df.assign(
indicator=lambda d: d.indicator.str.replace("*", "", regex=False).str.strip(),
value=lambda d: pd.to_numeric(d.value, errors="coerce"),
)

return df
Expand Down

0 comments on commit a70bc94

Please sign in to comment.