From 279c6ec56692cd786f1995d0c013705de1ebe652 Mon Sep 17 00:00:00 2001 From: MartinKalema Date: Tue, 11 Jun 2024 12:23:21 +0300 Subject: [PATCH] combined experiment tracking with model training --- ...03_model_evaluation.ipynb => .dockerignore | 0 configuration/configuration.yaml | 3 +- main.py | 8 +-- parameters.yaml | 2 +- research/02_model_training.ipynb | 2 - .../components/model_evaluation.py | 0 ...ng.py => model_training_and_evaluation.py} | 48 +++++++++----- .../components/prediction_service.py | 0 .../configuration/configuration.py | 18 +++-- src/swahiliNewsClassifier/entity/entities.py | 65 +++++++++++++------ .../pipeline/stage_02_model_training.py | 31 --------- .../stage_02_model_training_and_evaluation.py | 31 +++++++++ .../pipeline/stage_03_model_evaluation.py | 0 .../pipeline/stage_04_prediction.py | 0 template.py | 10 +-- 15 files changed, 127 insertions(+), 91 deletions(-) rename research/03_model_evaluation.ipynb => .dockerignore (100%) delete mode 100644 src/swahiliNewsClassifier/components/model_evaluation.py rename src/swahiliNewsClassifier/components/{model_training.py => model_training_and_evaluation.py} (69%) delete mode 100644 src/swahiliNewsClassifier/components/prediction_service.py delete mode 100644 src/swahiliNewsClassifier/pipeline/stage_02_model_training.py create mode 100644 src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py delete mode 100644 src/swahiliNewsClassifier/pipeline/stage_03_model_evaluation.py delete mode 100644 src/swahiliNewsClassifier/pipeline/stage_04_prediction.py diff --git a/research/03_model_evaluation.ipynb b/.dockerignore similarity index 100% rename from research/03_model_evaluation.ipynb rename to .dockerignore diff --git a/configuration/configuration.yaml b/configuration/configuration.yaml index 7d31ac0..61710f8 100644 --- a/configuration/configuration.yaml +++ b/configuration/configuration.yaml @@ -6,8 +6,9 @@ data_ingestion: test_source_URL: 
https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing train_data_file: artifacts/data_ingestion/compressed/train_data.zip test_data_file: artifacts/data_ingestion/compressed/test_data.zip - unzip_dir: artifacts/data_ingestion/decompressed + decompressed_dir: artifacts/data_ingestion/decompressed training: root_dir: artifacts/models training_data_path: artifacts/data_ingestion/decompressed/Train.csv + testing_data_path: artifacts/data_ingestion/decompressed/Test.csv diff --git a/main.py b/main.py index dd85cd7..aa932fe 100644 --- a/main.py +++ b/main.py @@ -1,8 +1,6 @@ from swahiliNewsClassifier import log from swahiliNewsClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline -from swahiliNewsClassifier.pipeline.stage_02_model_training import ModelTrainingPipeline -# from swahiliNewsClassifier.pipeline.stage_03_model_training import TrainingPipeline -# from swahiliNewsClassifier.pipeline.stage_04_model_evaluation import EvaluationPipeline +from swahiliNewsClassifier.pipeline.stage_02_model_training_and_evaluation import ModelTrainingAndEvaluationPipeline def run_pipeline_stage(stage_name, pipeline_class) -> None: @@ -30,6 +28,4 @@ def run_pipeline_stage(stage_name, pipeline_class) -> None: if __name__ == '__main__': run_pipeline_stage("DATA INGESTION STAGE", DataIngestionTrainingPipeline) - run_pipeline_stage("MODEL TRAINING STAGE", ModelTrainingPipeline) - # run_pipeline_stage("Model Training Stage", TrainingPipeline) - # run_pipeline_stage("Model Evaluation Stage", EvaluationPipeline) + run_pipeline_stage("MODEL TRAINING AND EVALUATION STAGE", ModelTrainingAndEvaluationPipeline) diff --git a/parameters.yaml b/parameters.yaml index ccff59f..04d9b22 100644 --- a/parameters.yaml +++ b/parameters.yaml @@ -3,7 +3,7 @@ LEARNING_RATE_2: 0.05 LEARNING_RATE_3: 0.05 LEARNING_RATE_4: 0.05 LEARNING_RATE_5: 0.03 -NUMBER_OF_CLASSES: 2 +NUMBER_OF_CLASSES: 5 EPOCHS_1: 5 EPOCHS_2: 5 EPOCHS_3: 5 diff --git 
a/research/02_model_training.ipynb b/research/02_model_training.ipynb index 13311e2..9d09a1c 100644 --- a/research/02_model_training.ipynb +++ b/research/02_model_training.ipynb @@ -93,7 +93,6 @@ " epochs_4: int\n", " epochs_5: int\n", " training_data: Path\n", - " number_of_classes: int\n", " root_dir: Path" ] }, @@ -142,7 +141,6 @@ " epochs_3=self.params.EPOCHS_3,\n", " epochs_4=self.params.EPOCHS_4,\n", " epochs_5=self.params.EPOCHS_5,\n", - " number_of_classes=self.params.NUMBER_OF_CLASSES,\n", "\n", " )" ] diff --git a/src/swahiliNewsClassifier/components/model_evaluation.py b/src/swahiliNewsClassifier/components/model_evaluation.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/swahiliNewsClassifier/components/model_training.py b/src/swahiliNewsClassifier/components/model_training_and_evaluation.py similarity index 69% rename from src/swahiliNewsClassifier/components/model_training.py rename to src/swahiliNewsClassifier/components/model_training_and_evaluation.py index 4a0c243..130a3fe 100644 --- a/src/swahiliNewsClassifier/components/model_training.py +++ b/src/swahiliNewsClassifier/components/model_training_and_evaluation.py @@ -1,4 +1,4 @@ -from swahiliNewsClassifier.entity.entities import ModelTrainingConfig +from swahiliNewsClassifier.entity.entities import ModelTrainingAndEvaluationConfig from swahiliNewsClassifier import log import torch import fastai @@ -12,19 +12,20 @@ from swahiliNewsClassifier import log import boto3 from dotenv import load_dotenv - +import dagshub +import mlflow load_dotenv() -class ModelTraining: - def __init__(self, model_training_config: ModelTrainingConfig): +class ModelTrainingAndEvaluation: + def __init__(self, model_training_and_evaluation_config: ModelTrainingAndEvaluationConfig): """ Initialize ModelTraining object with the provided configuration. Args: - model_training_config (ModelTrainingConfig): Configuration object for model training. 
+ model_training_and_evaluation_config (ModelTrainingAndEvaluationConfig): Configuration object for model training and evaluation. """ - self.model_training_config = model_training_config + self.model_training_and_evaluation_config = model_training_and_evaluation_config self.bucket_name = "swahili-news-classifier" self.model_path = f"models/text_classifier_learner.pth" self.s3 = boto3.client('s3', aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), region_name=os.getenv('REGION_NAME')) @@ -47,7 +48,7 @@ def load_data(self) -> pd.DataFrame: pd.DataFrame: Loaded training data. """ log.info('Loading training data') - train = pd.read_csv(self.model_training_config.training_data) + train = pd.read_csv(self.model_training_and_evaluation_config.training_data) return train def prepare_data(self, train) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]': @@ -60,7 +61,7 @@ def prepare_data(self, train) -> 'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame Returns: tuple: A tuple containing training data (df_trn), validation data (df_val), and data for language model (df_lm). 
""" - df_trn, df_val = train_test_split(train, stratify=train['category'], test_size=self.model_training_config.test_size, random_state=123) + df_trn, df_val = train_test_split(train, stratify=train['category'], test_size=self.model_training_and_evaluation_config.test_size, random_state=123) df_lm = pd.concat([df_trn, df_val], axis=0)[['content']] return df_trn, df_val, df_lm @@ -80,7 +81,7 @@ def create_dataloaders(self, df_lm) -> DataLoaders: get_x=ColReader('text'), splitter=RandomSplitter(0.1)) - dls = dblock.dataloaders(df_lm, bs=self.model_training_config.batch_size_1) + dls = dblock.dataloaders(df_lm, bs=self.model_training_and_evaluation_config.batch_size_1) return dls def train_language_model(self, dls) -> Learner: @@ -96,7 +97,7 @@ def train_language_model(self, dls) -> Learner: log.info('Training Language Model Learner') learn = language_model_learner(dls, AWD_LSTM, drop_mult=0.3, metrics=[accuracy]).to_fp16() learn.lr_find() - learn.fine_tune(self.model_training_config.epochs_1, self.model_training_config.learning_rate_1) + learn.fine_tune(self.model_training_and_evaluation_config.epochs_1, self.model_training_and_evaluation_config.learning_rate_1) log.info('Saving best Language Model Learner.') @@ -123,7 +124,17 @@ def create_text_classifier_dataloaders(self, df_trn, dls_lm) -> DataLoaders: get_y=ColReader('category'), splitter=RandomSplitter(0.2)) - return dblock.dataloaders(df_trn, bs=self.model_training_config.batch_size_2) + return dblock.dataloaders(df_trn, bs=self.model_training_and_evaluation_config.batch_size_2) + + def log_to_mlflow(self, metrics: list) -> None: + os.environ['MLFLOW_TRACKING_URI'] = self.model_training_and_evaluation_config.mlflow_tracking_uri + + dagshub.init(repo_owner=self.model_training_and_evaluation_config.mlflow_repo_owner, repo_name=self.model_training_and_evaluation_config.mlflow_repo_name, mlflow=True) + + with mlflow.start_run(): + mlflow.log_params(self.model_training_and_evaluation_config.all_params) + 
mlflow.log_metric('val_loss', metrics[0]) + mlflow.log_metric('val_accuracy', metrics[1]) def train_text_classifier(self, dls) -> None: """ @@ -132,21 +143,24 @@ def train_text_classifier(self, dls) -> None: Args: dls (DataLoaders): Dataloaders for the text classifier. """ + log.info('Training Text Classifier Learner.') + learn = text_classifier_learner(dls, AWD_LSTM, metrics=[accuracy]).to_fp16() learn.load_encoder(f'language_model_learner') learn.lr_find() - learn.fit_one_cycle(self.model_training_config.epochs_2, self.model_training_config.learning_rate_2) + learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_2, self.model_training_and_evaluation_config.learning_rate_2) learn.freeze_to(-2) - learn.fit_one_cycle(self.model_training_config.epochs_3, slice(1e-3/(2.6**4), self.model_training_config.learning_rate_3)) + learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_3, slice(1e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_3)) learn.freeze_to(-3) - learn.fit_one_cycle(self.model_training_config.epochs_4, slice(5e-3/(2.6**4), self.model_training_config.learning_rate_4)) + learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_4, slice(5e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_4)) learn.unfreeze() - learn.fit_one_cycle(self.model_training_config.epochs_5, slice(1e-3/(2.6**4), self.model_training_config.learning_rate_5)) + learn.fit_one_cycle(self.model_training_and_evaluation_config.epochs_5, slice(1e-3/(2.6**4), self.model_training_and_evaluation_config.learning_rate_5)) + classifier_metrics = learn.validate() + self.log_to_mlflow(classifier_metrics) + learn.save_encoder(f'text_classifier_learner') - log.info("Saving best Text Classifier Learner.") - learn.save_encoder(f'text_classifier_learner') def run_pipeline(self) -> None: """ diff --git a/src/swahiliNewsClassifier/components/prediction_service.py b/src/swahiliNewsClassifier/components/prediction_service.py 
deleted file mode 100644 index e69de29..0000000 diff --git a/src/swahiliNewsClassifier/configuration/configuration.py b/src/swahiliNewsClassifier/configuration/configuration.py index b94dd02..06b117c 100644 --- a/src/swahiliNewsClassifier/configuration/configuration.py +++ b/src/swahiliNewsClassifier/configuration/configuration.py @@ -1,7 +1,10 @@ from swahiliNewsClassifier.constants import * from swahiliNewsClassifier.utilities.helper_functions import read_yaml, create_directories -from swahiliNewsClassifier.entity.entities import DataIngestionConfig, ModelTrainingConfig +from swahiliNewsClassifier.entity.entities import DataIngestionConfig, ModelTrainingAndEvaluationConfig +from dotenv import load_dotenv +import os +load_dotenv() class ConfigurationManager: def __init__(self, config_filepath=CONFIG_FILE_PATH, @@ -38,16 +41,16 @@ def get_data_ingestion_config(self) -> DataIngestionConfig: unzip_dir=config.unzip_dir ) - def get_model_training_config(self) -> ModelTrainingConfig: + def get_model_training_and_evaluation_config(self) -> ModelTrainingAndEvaluationConfig: """ - Get the model training configuration. + Get the model training and evaluation configuration. Returns: - ModelTrainingConfig: Configuration object for model training. + ModelTrainingAndEvaluationConfig: Configuration object for model training and evaluation. 
""" create_directories([self.config.training.root_dir]) - return ModelTrainingConfig( + return ModelTrainingAndEvaluationConfig( root_dir=self.config.training.root_dir, training_data=self.config.training.training_data_path, test_size=self.params.TEST_SIZE, @@ -63,6 +66,9 @@ def get_model_training_config(self) -> ModelTrainingConfig: epochs_3=self.params.EPOCHS_3, epochs_4=self.params.EPOCHS_4, epochs_5=self.params.EPOCHS_5, - number_of_classes=self.params.NUMBER_OF_CLASSES, + mlflow_repo_name=os.getenv('MLFLOW_REPO_NAME'), + mlflow_tracking_uri=os.getenv('MLFLOW_TRACKING_URI'), + mlflow_repo_owner=os.getenv('MLFLOW_REPO_OWNER'), + all_params=self.params, ) diff --git a/src/swahiliNewsClassifier/entity/entities.py b/src/swahiliNewsClassifier/entity/entities.py index 4cb10a8..db86462 100644 --- a/src/swahiliNewsClassifier/entity/entities.py +++ b/src/swahiliNewsClassifier/entity/entities.py @@ -13,38 +13,61 @@ class DataIngestionConfig: test_source_URL (str): The URL from which the test data will be fetched. train_data_file (Path): The local file path where the downloaded training data will be stored. test_data_file (Path): The local file path where the downloaded test data will be stored. - unzip_dir (Path): The directory where the downloaded data will be extracted or unzipped. + decompressed_dir (Path): The directory where the downloaded data will be extracted. """ root_dir: Path train_source_URL: str test_source_URL: str train_data_file: Path test_data_file: Path - unzip_dir: Path + decompressed_dir: Path @dataclass(frozen=True) -class ModelTrainingConfig: +class ModelTrainingAndEvaluationConfig: """ Configuration class for model training using ULMFiT (Universal Language Model Fine-tuning). Attributes: - test_size (float): Proportion of the dataset to include in the test split. - learning_rate_1 (float): Learning rate for training the language model learner. - learning_rate_2 (float): Learning rate for the first phase of classifier training. 
- learning_rate_3 (float): Learning rate for the second phase of classifier training. - learning_rate_4 (float): Learning rate for the third phase of classifier training. - learning_rate_5 (float): Learning rate for the fourth phase of classifier training. - batch_size_1 (int): Batch size for language model training. - batch_size_2 (int): Batch size for text classifier training. - epochs_1 (int): Number of epochs for training the language model learner. - epochs_2 (int): Number of epochs for the first phase of classifier training. - epochs_3 (int): Number of epochs for the second phase of classifier training. - epochs_4 (int): Number of epochs for the third phase of classifier training. - epochs_5 (int): Number of epochs for the fourth phase of classifier training. - training_data (Path): Path to the training data CSV file. - number_of_classes (int): Number of target classes in the classification task. - root_dir (Path): Root directory for storing model artifacts. + test_size (float): Proportion of the dataset to include in the test split. This parameter is used to split the dataset into training and validation sets. + + learning_rate_1 (float): Learning rate for training the language model learner. This is used during the fine-tuning of the pre-trained language model. + + learning_rate_2 (float): Learning rate for the first phase of classifier training. This is used in the initial phase of training the text classifier. + + learning_rate_3 (float): Learning rate for the second phase of classifier training. This is used in the second phase of training the text classifier. + + learning_rate_4 (float): Learning rate for the third phase of classifier training. This is used in the third phase of training the text classifier. + + learning_rate_5 (float): Learning rate for the fourth phase of classifier training. This is used in the final phase of training the text classifier. + + batch_size_1 (int): Batch size for language model training. 
This parameter defines the number of samples that will be propagated through the network at once during language model training. + + batch_size_2 (int): Batch size for text classifier training. This parameter defines the number of samples that will be propagated through the network at once during text classifier training. + + epochs_1 (int): Number of epochs for training the language model learner. This defines the number of complete passes through the training dataset. + + epochs_2 (int): Number of epochs for the first phase of classifier training. This defines the number of complete passes through the training dataset in the first phase. + + epochs_3 (int): Number of epochs for the second phase of classifier training. This defines the number of complete passes through the training dataset in the second phase. + + epochs_4 (int): Number of epochs for the third phase of classifier training. This defines the number of complete passes through the training dataset in the third phase. + + epochs_5 (int): Number of epochs for the fourth phase of classifier training. This defines the number of complete passes through the training dataset in the final phase. + + training_data (Path): Path to the training data CSV file. This file contains the text data and corresponding labels for training and validation. + + number_of_classes (int): Number of target classes in the classification task. This defines the number of unique labels in the dataset. + + root_dir (Path): Root directory for storing model artifacts. This directory is used to save trained models, logs, and other artifacts. + + mlflow_tracking_uri (str): URI for the MLflow tracking server. This is used to log and track experiments with MLflow. + + mlflow_repo_name (str): Repository name for MLflow tracking. This is used to organize and identify different MLflow runs within the repository. + + mlflow_repo_owner (str): Owner of the MLflow repository. This is used to identify the owner of the MLflow repository. 
+ + all_params (dict): Dictionary containing all parameters used for model training. This includes all hyperparameters and other settings for reproducibility and logging. """ test_size: float learning_rate_1: float @@ -62,3 +85,7 @@ class ModelTrainingConfig: training_data: Path number_of_classes: int root_dir: Path + mlflow_tracking_uri: str + mlflow_repo_name: str + mlflow_repo_owner: str + all_params: dict diff --git a/src/swahiliNewsClassifier/pipeline/stage_02_model_training.py b/src/swahiliNewsClassifier/pipeline/stage_02_model_training.py deleted file mode 100644 index eca538e..0000000 --- a/src/swahiliNewsClassifier/pipeline/stage_02_model_training.py +++ /dev/null @@ -1,31 +0,0 @@ -from swahiliNewsClassifier.configuration.configuration import ConfigurationManager -from swahiliNewsClassifier.components.model_training import ModelTraining -from swahiliNewsClassifier import log - -STAGE_NAME = "Model Training Stage" - - -class ModelTrainingPipeline: - def __init__(self): - """ - Initialize the ModelTrainingPipeline object. - """ - self.config = ConfigurationManager() - - def main(self): - """ - Execute the data ingestion process. 
- """ - try: - model_training_config = self.config.get_model_training_config() - model_training = ModelTraining( - model_training_config=model_training_config) - model_training.run_pipeline() - except Exception as e: - log.exception(f"An error occurred during {STAGE_NAME}: {e}") - raise e - - -if __name__ == '__main__': - pipeline = ModelTrainingPipeline() - pipeline.main() diff --git a/src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py b/src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py new file mode 100644 index 0000000..6eecc26 --- /dev/null +++ b/src/swahiliNewsClassifier/pipeline/stage_02_model_training_and_evaluation.py @@ -0,0 +1,31 @@ +from swahiliNewsClassifier.configuration.configuration import ConfigurationManager +from swahiliNewsClassifier.components.model_training_and_evaluation import ModelTrainingAndEvaluation +from swahiliNewsClassifier import log + +STAGE_NAME = "Model Training and Evaluation Stage" + + +class ModelTrainingAndEvaluationPipeline: + def __init__(self): + """ + Initialize the ModelTrainingAndEvaluationPipeline object. + """ + self.config = ConfigurationManager() + + def main(self): + """ + Execute the model training and evaluation process. 
+ """ + try: + model_training_and_evaluation_config = self.config.get_model_training_and_evaluation_config() + model_training_and_evaluation = ModelTrainingAndEvaluation( + model_training_and_evaluation_config=model_training_and_evaluation_config) + model_training_and_evaluation.run_pipeline() + except Exception as e: + log.exception(f"An error occurred during {STAGE_NAME}: {e}") + raise e + + +if __name__ == '__main__': + pipeline = ModelTrainingAndEvaluationPipeline() + pipeline.main() diff --git a/src/swahiliNewsClassifier/pipeline/stage_03_model_evaluation.py b/src/swahiliNewsClassifier/pipeline/stage_03_model_evaluation.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/swahiliNewsClassifier/pipeline/stage_04_prediction.py b/src/swahiliNewsClassifier/pipeline/stage_04_prediction.py deleted file mode 100644 index e69de29..0000000 diff --git a/template.py b/template.py index 0d966a4..0c905b4 100644 --- a/template.py +++ b/template.py @@ -30,18 +30,14 @@ def create_file_with_directories(filepath: Path) -> None: f"src/{project_name}/__init__.py", f"src/{project_name}/components/__init__.py", f"src/{project_name}/components/data_ingestion.py", - f"src/{project_name}/components/prediction_service.py", - f"src/{project_name}/components/model_training.py", - f"src/{project_name}/components/model_evaluation.py", + f"src/{project_name}/components/model_training_and_evaluation.py", f"src/{project_name}/utilities/_init__.py", f"src/{project_name}/utilities/helper_functions.py", f"src/{project_name}/configuration/__init__.py", f"src/{project_name}/configuration/configuration.py", f"src/{project_name}/pipeline/__init__.py", f"src/{project_name}/pipeline/stage_01_data_ingestion.py", - f"src/{project_name}/pipeline/stage_02_model_training.py", - f"src/{project_name}/pipeline/stage_03_model_evaluation.py", - f"src/{project_name}/pipeline/stage_04_prediction.py", + f"src/{project_name}/pipeline/stage_02_model_training_and_evaluation.py", 
f"src/{project_name}/entity/__init__.py", f"src/{project_name}/entity/entities.py", f"src/{project_name}/constants/__init__.py", @@ -57,8 +53,6 @@ def create_file_with_directories(filepath: Path) -> None: "logs/20240608-124455.log", "research/01_data_ingestion.ipynb", "research/02_model_training.ipynb", - "research/03_model_evaluation.ipynb", - "templates/index.html", "app.py", "autopep.py", ".env",