From 9fbf73a4cd9696643f9c676497a384200e7c5252 Mon Sep 17 00:00:00 2001 From: YurelyCamacho <49034451+YurelyCamacho@users.noreply.github.com> Date: Tue, 31 Jan 2023 08:41:42 -0400 Subject: [PATCH] docs: Review docstrings (#199) * Review docstrings * Review doscstring * Arrangement of number of characters in the lines * Update worldbank.py --- .../analysis/forecast_models/metrics.py | 29 ++-- .../forecast_models/ngboost_models.py | 141 ++++++++++-------- epigraphhub/analysis/preprocessing.py | 82 +++++----- epigraphhub/data/colombia/extract.py | 49 +++--- epigraphhub/data/epigraphhub_db.py | 46 +++--- epigraphhub/data/worldbank.py | 139 +++++++++-------- 6 files changed, 273 insertions(+), 213 deletions(-) diff --git a/epigraphhub/analysis/forecast_models/metrics.py b/epigraphhub/analysis/forecast_models/metrics.py index 935d7a5b..b1b000ce 100644 --- a/epigraphhub/analysis/forecast_models/metrics.py +++ b/epigraphhub/analysis/forecast_models/metrics.py @@ -9,17 +9,21 @@ # computing some metrics def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame: """ - This function evaluates the predictions obtained in the `train_eval` method - in the train and test sample. The predictions must be saved in a dataset with the following columns: - 'median', 'target' and 'train_size'. - This function uses the following metrics: - - explained variance score; - - mean absolute error; - - mean squared error; - - root mean squared error; - - mean squared log error; - - mean absolute percentage error. - To compute this metrics we use the implementations of the sklearn.metrics package. + This function evaluates the predictions obtained in the `train_eval` + method in the train and test sample. The predictions must be saved + in a dataset with the following columns: 'median', 'target' and + 'train_size'. 
+ + This function uses the following metrics: + + - explained variance score; + - mean absolute error; + - mean squared error; + - root mean squared error; + - mean squared log error; + - mean absolute percentage error. + To compute this metrics we use the implementations of the + sklearn.metrics package. Parameters ---------- @@ -29,7 +33,8 @@ def compute_metrics(df_pred: pd.DataFrame) -> pd.DataFrame: Returns ------- pd.DataFrame - Dataframe with two columns: out_sample and in_sample and with the metrics as index. + DataFrame with two columns: out_sample and in_sample and with + the metrics as index. """ metrics = [ diff --git a/epigraphhub/analysis/forecast_models/ngboost_models.py b/epigraphhub/analysis/forecast_models/ngboost_models.py index cf96a394..34445fea 100644 --- a/epigraphhub/analysis/forecast_models/ngboost_models.py +++ b/epigraphhub/analysis/forecast_models/ngboost_models.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 """ -The functions in this module allow the application of the -ngboost regressor model. There are separate methods to train and evaluate (separate -the data in train and test datasets), train with all the data available, and make -forecasts. +The functions in this module allow the application of the ngboost +regressor model. There are separate methods to train and evaluate +(separate the data in train and test datasets), train with all the data +available, and make forecasts. """ from typing import Union @@ -40,9 +40,9 @@ class NGBModel: """ - This class allows the user to create a ngboost model. The methods in this - class allows the user to train and evaluate the model, to train and save the model - and make the forecast using saved models. + This class allows the user to create a ngboost model. The methods + allows to train and evaluate the model, to train and save the model + and make the forecast using saved models. 
""" def __init__( @@ -57,19 +57,21 @@ def __init__( Parameters ---------- look_back : int - Number of the last days that will be used to forecast the next days. + Number of the last days that will be used to forecast the + next days. predict_n : int Number of days that will be predicted. validation_split : float - Proportion of training data to set aside as validation data for early stopping. + Proportion of training data to set aside as validation data + for early stopping. early_stop : int - The number of consecutive boosting iterations during which the - loss has to increase before the algorithm stops early. - Set to None to disable early stopping and validation. - None enables running over the full data set. - params_model : dictionary + The number of consecutive boosting iterations during which + the loss has to increase before the algorithm stops early. + Set to None to disable early stopping and validation. None + enables running over the full data set. + params_model : dict The dict with the params that will be used in the ngboost - regressor model. + regressor model. """ # This will remove the validation_fraction and early_stopping_rounds parameters since it shouldn't. @@ -98,44 +100,51 @@ def train_eval( save: bool = False, ) -> pd.DataFrame: """ - Function to apply a ngboost regressor model given a dataset and a target column. - This function will train multiple models, each one specilist in predict the X + n - days, of the target column, where n is in the range (1, number of days that you - want predict). - This function split the data in train and test dataset and returns the predictions - made using the test dataset. + Function to apply a ngboost regressor model given a dataset and + a target column. This function will train multiple models, each + one specilist in predict the X + n days, of the target column, + where n is in the range (1, number of days that you want + predict). 
This function split the data in train and test dataset + and returns the predictions made using the test dataset. Parameters ---------- target_name : str - Name of the target column. + Name of the target column. data : pd.DataFrame - Dataframe with features and target column. + DataFrame with features and target column. ini_date : str, optional - Determines the beggining of the train dataset, by default None + Determines the beggining of the train dataset, by default + None. end_train_date : str, optional - Determines the beggining of end of train dataset. If end_train_date - is not None, then ratio isn't used, by default None + Determines the beggining of end of train dataset. If is not + None, then ratio isn't used, by default None. end_date : str, optional - Determines the end of the dataset used in validation, by default None + Determines the end of the dataset used in validation, by + default None. ratio : float - Determines which percentage of the data will be used to train the model, by default 0.75 + Determines which percentage of the data will be used to + train the model, by default 0.75. path : str, optional - It indicates where save the models trained, by default None + It indicates where save the models trained, by default None. name : str, optional - It indicates which name use to save the models trained, by default None + It indicates which name use to save the models trained, by + default None. save : bool - If True the models trained are saved, by default False + If True the models trained are saved, by default False. Returns ------- pd.DataFrame - A dataframe with four columns (and a date index): - - target: The target values - - lower: The lower value of the confidence interval of 95% - - median: The median value of the confidence interval of 95% - - upper: The upper value of the confidence interval of 95% - - train_size: The number of rows of data using as training data. 
+ A DataFrame with four columns (and a date index): + + - target: The target values. + - lower: The lower value of the confidence interval of 95%. + - median: The median value of the confidence interval of + 95%. + - upper: The upper value of the confidence interval of 95%. + - train_size: The number of rows of data using as training + data. """ df_lag = preprocess_data(data, self.look_back, ini_date, end_date) @@ -251,32 +260,35 @@ def train( name: str = "train_ngb", ) -> list: """ - Function to train multiple ngboost regressor models given a dataset and a target column. - This function will train multiple models, each one specilist in predict the X + n - days, of the target column, where n is in the range (1, number of days that you - want predict). - This function will train the model with all the data available and will save the model - that will be used to make forecasts. + Function to train multiple ngboost regressor models given a + dataset and a target column. This function will train multiple + models, each one specilist in predict the X + n days, of the + target column, where n is in the range (1, number of days that + you want predict). This function will train the model with all + the data available and will save the model that will be used to + make forecasts. Parameters ---------- target_name : str Name of the target column. data : pd.DataFrame - Dataframe with features and target column + DataFrame with features and target column. ini_date : str, optional - Determines the beggining of the train dataset, by default None + Determines the beggining of the train dataset, by default + None. end_date : str, optional - Determines the end of the train dataset, by default None + Determines the end of the train dataset, by default None. save : bool - If True the models is saved, by default True + If True the models is saved, by default True. 
path : str, optional - Indicates where the models will be saved, by default "../opt/models/saved_models/ml" + Indicates where the models will be saved, by default + "../opt/models/saved_models/ml". Returns ------- list - A list with the trained models + A list with the trained models. """ predict_n = self.predict_n @@ -320,34 +332,37 @@ def forecast( ) -> pd.DataFrame: """ - Function to load multiple ngboost regressor model trained with the function - `training_model` and make the forecast. + Function to load multiple ngboost regressor model trained with + the function `training_model` and make the forecast. - Important: - predict_n and max_lag need have the same value used in training_model - Only the last that of the dataset will be used to forecast the next - predict_n days. + Important: predict_n and max_lag need have the same value used + in training_model. Only the last that of the dataset will be + used to forecast the next predict_n days. Parameters ---------- target_name : str Name of the target column. data : pd.DataFrame - Dataframe with features and target column + DataFrame with features and target column. ini_date : str, optional - Determines the beggining of the train dataset, by default None + Determines the beggining of the train dataset, by default + None. end_date : str, optional - Determines the end of the train dataset, by default None + Determines the end of the train dataset, by default None. path : str, optional - Indicates where the models will be saved, by default "../opt/models/saved_models/ml" + Indicates where the models will be saved, by default + "../opt/models/saved_models/ml". 
Returns ------- pd.DataFrame - A dataframe with three columns regarding(and a date index): - - lower: The lower value of the confidence interval of 95% - - median: The median value of the confidence interval of 95% - - upper: The upper value of the confidence interval of 95% + A DataFrame with three columns regarding(and a date index): + + - lower: The lower value of the confidence interval of 95%. + - median: The median value of the confidence interval of + 95%. + - upper: The upper value of the confidence interval of 95%. """ df_lag = preprocess_data(data, self.look_back, None, end_date) diff --git a/epigraphhub/analysis/preprocessing.py b/epigraphhub/analysis/preprocessing.py index 488d2f6b..3b841796 100644 --- a/epigraphhub/analysis/preprocessing.py +++ b/epigraphhub/analysis/preprocessing.py @@ -1,7 +1,8 @@ #!/usr/bin/env python3 """ -The functions in this module transform the data in a format that is accepted by -ML models (tabular data) and neural network models (3D array data and multiple-output). +The functions in this module transform the data in a format that is +accepted by ML models (tabular data) and neural network models (3D array +data and multiple-output). """ from typing import Tuple, Union @@ -18,15 +19,18 @@ def build_lagged_features( dt: pd.DataFrame, maxlag: int = 2, dropna: bool = True ) -> pd.DataFrame: """ - Builds a new DataFrame to facilitate regressing over all possible lagged features + Builds a new DataFrame to facilitate regressing over all possible + lagged features. + Parameters ---------- dt : pd.DataFrame - Dataframe containing features + Dataframe containing features. maxlag : int, optional - maximum lags to compute, by default 2 + Maximum lags to compute, by default 2. dropna : bool, optional - If true the initial rows containing NANs due to lagging will be dropped, by default True + If true the initial rows containing NANs due to lagging will be + dropped, by default True. 
Returns ------- @@ -63,13 +67,14 @@ def preprocess_data( end_date: Union[str, None] = None, ) -> pd.DataFrame: """ - This function creates a dataframe with lagged columns that allow the + This function creates a DataFrame with lagged columns that allow the application of ML regression model. Parameters ---------- data : pd.DataFrame - Dataframe with datetime index and the target and features in the columns. + Dataframe with datetime index and the target and features in the + columns. maxlag : int The max number of days used to compute the lagged columns. ini_date : str, optional @@ -80,7 +85,7 @@ def preprocess_data( Returns ------- pd.DataFrame - The data frame with the lagged columns. + The DataFrame with the lagged columns. """ df_lag = build_lagged_features(copy.deepcopy(data), maxlag=maxlag) @@ -98,20 +103,20 @@ def preprocess_data( def get_targets(target: pd.Series, predict_n: int) -> dict: """ - Function to create a dictionary with the targets that - it will be used to train the ngboost model + Function to create a dictionary with the targets that it will be + used to train the ngboost model. Parameters ---------- target : pd.Series - array with the values used as target + Array with the values used as target. predict_n : int - Number os days that it will be predicted. + Number of days that it will be predicted. Returns ------- dictionary - A dictionary with the targets used to train the model + A dictionary with the targets used to train the model. """ targets = {} @@ -124,15 +129,17 @@ def get_targets(target: pd.Series, predict_n: int) -> dict: def get_next_n_days(ini_date: str, next_days: int) -> list: """ - Return a list of dates with the {next_days} days after ini_date. This - function was designed to generate the dates of the forecast models. + Return a list of dates with the {next_days} days after ini_date. + This function was designed to generate the dates of the forecast + models. Parameters ---------- ini_date : str - Initial date + Initial date. 
next_days : int - Number os days to be included in the list after the date in ini_date. + Number of days to be included in the list after the date in + ini_date. Returns ------- @@ -160,30 +167,30 @@ def lstm_split_data( Y_column: int = 0, ) -> Tuple[np.array, np.array, np.array, np.array]: """ - Split the data into training and test sets - Keras expects the input tensor to have a shape of (nb_samples, look_back, features), - and a output shape of (,predict_n) + Split the data into training and test sets. Keras expects the input + tensor to have a shape of (nb_samples, look_back, features), and a + output shape of (,predict_n). Parameters ---------- df : pd.DataFrame - Pandas dataframe with the data. + DataFrame with the data. look_back : int, optional - Number of weeks to look back before predicting. By default 12 + Number of weeks to look back before predicting. By default 12. ratio : float, optional - Fraction of total samples to use for training. By default 0.8 + Fraction of total samples to use for training. By default 0.8. predict_n : int, optional - Number of weeks to predict. By default 5 + Number of weeks to predict. By default 5. Y_column : int, optional - Column to predict. By default 0 + Column to predict. By default 0. Returns ------- Tuple[np.array,np.array,np.array,np.array] - X_train: array of features to train the model - y_train: array of targets to train the model - X_test: array of features to test the model - y_test: array of targets to test the model + X_train: array of features to train the model. + y_train: array of targets to train the model. + X_test: array of features to test the model. + y_test: array of targets to test the model. """ df = np.nan_to_num(df.values).astype("float64") @@ -212,20 +219,23 @@ def normalize_data( df: pd.DataFrame, log_transform: bool = False ) -> Tuple[pd.DataFrame, pd.Series]: """ - Normalize features in the df table and return the normalized table and the values - used to compute the normalization. 
+ Normalize features in the df table and return the normalized table + and the values used to compute the normalization. + Parameters ---------- df : pd.DataFrame - DataFrame to be normalized by the maximum value + DataFrame to be normalized by the maximum value. log_transform : bool, optional - If true the log transformation is applied in the data, by default False + If true the log transformation is applied in the data, by + default False. Returns ------- Tuple[pd.DataFrame, pd.Series] - pd.DataFrame: normalized dataframe - pd.Series: series of the max values used in the normalization + pd.DataFrame: normalized DataFrame. + pd.Series: Series of the max + values used in the normalization. """ df.fillna(0, inplace=True) diff --git a/epigraphhub/data/colombia/extract.py b/epigraphhub/data/colombia/extract.py index fcacaac1..8abeecae 100644 --- a/epigraphhub/data/colombia/extract.py +++ b/epigraphhub/data/colombia/extract.py @@ -1,8 +1,9 @@ """ Last change on 2022/09/22 -Comparing Colombia Governmental COVID data consists in a step before pushing it to -the SQL Database. Is responsible for retrieving the last date in both CSV and SQL table. -Connects to the Colombia data through Socrata API and returns the maximum date found. +Comparing Colombia Governmental COVID data consists in a step before +pushing it to the SQL Database. Is responsible for retrieving the last +date in both CSV and SQL table. Connects to the Colombia data through +Socrata API and returns the maximum date found. Methods ------- @@ -33,15 +34,18 @@ def compare() -> bool: def _table_last_update() -> datetime: """ - This method will connect to the SQL Database and query the maximum date found in - Colombia table. - - Returns: - date (datetime) : Max date found in Colombia table. - - Raises: - Exception (Exception) : Unable to access Colombia table. - @see epigraphhub.connection + This method will connect to the SQL Database and query the maximum + date found in Colombia table. 
+ + Returns + ------- + date : datetime + Max date found in Colombia table. + + Raises + ------ + Exception : Exception + Unable to access Colombia table. @see epigraphhub.connection """ engine = get_engine(credential_name=env.db.default_credential) try: @@ -59,14 +63,19 @@ def _table_last_update() -> datetime: def _web_last_update() -> datetime: """ - This method will request the maximum date found in Colombia data through Socrata API - and returns it as a datetime object for further evaluation. - - Returns: - date (datetime) : Max date found in Colombia data through Socrata. - - Raises: - Exception (Exception) : Unable to create Socrata request. + This method will request the maximum date found in Colombia data + through Socrata API and returns it as a datetime object for further + evaluation. + + Returns + ------- + date : datetime + Max date found in Colombia data through Socrata. + + Raises + ------ + Exception : Exception + Unable to create Socrata request. """ try: report_date = [ diff --git a/epigraphhub/data/epigraphhub_db.py b/epigraphhub/data/epigraphhub_db.py index d52ab1ee..696584e1 100644 --- a/epigraphhub/data/epigraphhub_db.py +++ b/epigraphhub/data/epigraphhub_db.py @@ -1,6 +1,6 @@ """ -The functions in this module allow the user to get the datasets stored in the -epigraphhub database. +The functions in this module allow the user to get the datasets stored +in the epigraphhub database. """ from typing import Union @@ -21,11 +21,12 @@ def get_agg_data( schema: str, table_name: str, columns: list, method: str, ini_date: str ) -> pd.DataFrame: """ - This function provides an aggregate data frame for the table selected in the param - table_name. The columns should be a list with three values. The first should be - a date column, the second a column that will be used for the aggregation - (e.g. regions name), and the third the column that will be used to compute the - result of the aggregation. 
+ This function provides an aggregate DataFrame for the table selected + in the param table_name. The columns should be a list with three + values. The first should be a date column, the second a column that + will be used for the aggregation (e.g. regions name), and the third + the column that will be used to compute the result of the + aggregation. Parameters ---------- @@ -34,21 +35,21 @@ def get_agg_data( table_name : str The name of the table. columns : list - The list of Columns from the table that will be used in the - aggregation. The first column should be a date column, - the second should be a column with the regions name that we want - to aggregate (e.g. regions name), and the third will be used - to compute the result of aggregation. + The list of columns from the table that will be used in the + aggregation. The first column should be a date column, the + second should be a column with the regions name that we want to + aggregate (e.g. regions name), and the third will be used to + compute the result of aggregation. method : str - The method name to be applied in the aggregation the - possible options are: 'COUNT', 'SUM', and 'AVG'. + The method name to be applied in the aggregation, the possible + options are: 'COUNT', 'SUM', and 'AVG'. ini_date : str Initial data to start the aggregation. Returns ------- pd.DataFrame - The return is a pandas dataframe + The return is a pandas DataFrame. """ table_name = table_name.lower() @@ -75,8 +76,8 @@ def get_data_by_location( loc_column: str, ) -> pd.DataFrame: """ - This function provides a data frame for the table selected in the param table_name and - the chosen regions in the param georegion. + This function provides a DataFrame for the table selected in the + param table_name and the chosen regions in the param georegion. Parameters ---------- @@ -85,10 +86,11 @@ def get_data_by_location( table_name : str Name of the table that you want to get the data. 
loc : Union[list[str], str] - This list contains all the locations of interest or the string 'All' - to return all the regions. + This list contains all the locations of interest or the string + 'All' to return all the regions. columns : list[str], None - Columns that you want to select from the table table_name. If None all the columns will be returned. + Columns that you want to select from the table table_name. If + None all the columns will be returned. loc_column : str Name of the column to filter by location name. @@ -110,8 +112,8 @@ def get_data_by_location( if type(loc) != list and loc != "All": raise Exception( - """Error. The georegion param should be a list or the string All to - return all the georegions.""" + """Error. The georegion param should be a list or the string + 'All' to return all the georegions.""" ) if type(columns) != list and columns != None: diff --git a/epigraphhub/data/worldbank.py b/epigraphhub/data/worldbank.py index 56467df0..2dc43ec6 100644 --- a/epigraphhub/data/worldbank.py +++ b/epigraphhub/data/worldbank.py @@ -6,24 +6,28 @@ def get_pop_data(country, time="all", fx_et="5Y"): """ Function to get the population data stratified by age and sex from - the world bank data - - :params country: string. ISO-CODE of the country of interest. - - :params time: Interval of years. If filled `time = 'all'`, the function will - return all the data available. You can also specify a range - of years. For example, if you want to get the data for the - period between the years 2010 and 2020, you can fill this - parameter with `time = range(2010,2021)`. - - :params fx_et: string. If fx_et == '5Y', it will be returned the - population by 5-year age groups. - If fx_et == 'IN', it will be return the - population divided in 3 age groups. - If fx_et == 'TOTL', it will be returned the - total population without consider the age groups. - - :returns: DataFrame. + the world bank data. 
+ + Parameters + ---------- + country : str + ISO-CODE of the country of interest. + time : range, str + Interval of years. If filled `time = 'all'`, the function will + return all the data available. You can also specify a range of + years. For example, if you want to get the data for the period + between the years 2010 and 2020, you can fill this parameter + with `time = range(2010,2021)`. + fx_et : str + If fx_et == '5Y', the population is returned by 5-year age + groups. If fx_et == 'IN', the population is returned divided in + 3 age groups. If fx_et == 'TOTL', the total population is + returned without considering the age groups. + + Returns + ------- + pd.DataFrame + The return is a pandas DataFrame. """ fx_et = fx_et.upper() @@ -97,14 +101,18 @@ def get_pop_data(country, time="all", fx_et="5Y"): def search_in_database(keyword): """ - Returns a dataframe with the database matched. - - :params keyword: string. Full name or keyword to search in the database - names. If the string 'all' is used in the parameter keyword all the - databases names are returned. - - :returns: DataFrame. - + Returns a DataFrame with the matching databases. + + Parameters + ---------- + keyword : str + Full name or keyword to search in the database names. If the + string 'all' is used, all the database names are returned. + + Returns + ------- + pd.DataFrame + The return is a pandas DataFrame. """ results = [] @@ -127,19 +135,24 @@ def search_in_database(keyword): def search_in_indicators(keyword, db=2): - """ - Returns a dataframe with the indicators matched by partial name. - - :params keyword: string|None. keyword to search in the indicators name. If None, - all the indicators available will be returned. - :params db:int. Number associated with the database whose you want to get the list - of indicators. You can discover this number in the function 'search_in - _database'.
By default the indicators are search over the World Development - Indicators database (db = 2). - - :returns: DataFrame - + Returns a DataFrame with the indicators matched by partial name. + + Parameters + ---------- + keyword : str or None + Keyword to search for in the indicator names. If None, all the + indicators available will be returned. + db : int + Number associated with the database whose list of indicators + you want to get. You can discover this number in the function + 'search_in_database'. By default the indicators are searched in + the World Development Indicators database (db = 2). + + Returns + ------- + pd.DataFrame + The return is a pandas DataFrame. """ results = [] @@ -151,29 +164,35 @@ def search_in_indicators(keyword, db=2): def get_worldbank_data(ind, country, db=2, time="all", columns=None): - """ - This function get a list of indicators according to some country from the world data bank - and return this series in a dataframe. - - :params ind: List of strings. List with the indicators whose data you want to get. - - :params country: List of strings|string. List with the ISO-CODE for the countries whose - data you want to get. - - :params time: Interval of years. If filled `time = 'all'`, the function will - return all the data available. You can also specify a range of years. - For example, if you want to get the data for the period between the - years 2010 and 2020, you can fill this parameter with - `time = range(2010,2021)`. - - :params columns: List of strings. List with names to rename the columns instead of use the - names in the `ind` list. - - Important: The lists `ind` and `columns` must have the same lenght. - - :returns: DataFrame - + This function gets a list of indicators for some country from + the World Bank data and returns these series in a DataFrame. + + Parameters + ---------- + ind : list + List with the indicators whose data you want to get.
+ country : list + List with the ISO-CODE for the countries whose data you want to + get. + time : range, str + Interval of years. If filled `time = 'all'`, the function will + return all the data available. You can also specify a range of + years. For example, if you want to get the data for the period + between the years 2010 and 2020, you can fill this parameter + with `time = range(2010,2021)`. + columns : list + List with names to rename the columns instead of using the names + in the `ind` list. + + Important + --------- + The lists `ind` and `columns` must have the same length. + + Returns + ------- + pd.DataFrame + The return is a pandas DataFrame. """ if len(ind) == 1: