From b936ca17a89cd7227e0e4a20b9b810093b7683f1 Mon Sep 17 00:00:00 2001 From: MartinKalema Date: Sat, 8 Jun 2024 15:58:23 +0300 Subject: [PATCH] Finished data ingestion --- configuration/configuration.yaml | 4 +- main.py | 30 +++++++++ research/01_data_ingestion.ipynb | 53 +++++++++------- .../components/data_ingestion.py | 61 +++++++++++++++++++ .../configuration/configuration.py | 37 +++++++++++ src/swahiliNewsClassifier/entity/entities.py | 19 ++++++ src/swahiliNewsClassifier/entity/entity.py | 0 .../pipeline/stage_01_data_ingestion.py | 31 ++++++++++ 8 files changed, 210 insertions(+), 25 deletions(-) create mode 100644 src/swahiliNewsClassifier/entity/entities.py delete mode 100644 src/swahiliNewsClassifier/entity/entity.py diff --git a/configuration/configuration.yaml b/configuration/configuration.yaml index f5d7afd..ccd980d 100644 --- a/configuration/configuration.yaml +++ b/configuration/configuration.yaml @@ -4,8 +4,8 @@ data_ingestion: root_dir: artifacts/data_ingestion train_source_URL: https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing test_source_URL: https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing - train_data_file: artifacts/data_ingestion/traindata.zip - test_data_file: artifacts/data_ingestion/testdata.zip + train_data_file: artifacts/data_ingestion/train_data.zip + test_data_file: artifacts/data_ingestion/test_data.zip unzip_dir: artifacts/data_ingestion training: diff --git a/main.py b/main.py index e69de29..70f4112 100644 --- a/main.py +++ b/main.py @@ -0,0 +1,30 @@ +from swahiliNewsClassifier import classifierlogger +from swahiliNewsClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline +# from swahiliNewsClassifier.pipeline.stage_02_prepare_base_model import PrepareBaseModelPipeline +# from swahiliNewsClassifier.pipeline.stage_03_model_training import TrainingPipeline +# from swahiliNewsClassifier.pipeline.stage_04_model_evaluation import 
EvaluationPipeline + +def run_pipeline_stage(stage_name, pipeline_class): + """ + Run a pipeline stage and handle logging and exceptions. + + Args: + stage_name (str): The name of the stage to run. + pipeline_class (class): The class of the pipeline stage to instantiate and run. + """ + try: + classifierlogger.info("*********************************\n") + classifierlogger.info(f">>>>>> {stage_name} started <<<<<<") + pipeline = pipeline_class() + pipeline.main() + classifierlogger.info(f">>>>>> {stage_name} completed <<<<<<<\n") + classifierlogger.info("**********************************\n") + except Exception as e: + classifierlogger.exception(f"An error occurred during {stage_name}: {e}") + raise e + +if __name__ == '__main__': + run_pipeline_stage("Data Ingestion Stage", DataIngestionTrainingPipeline) + # run_pipeline_stage("Prepare Base Model Stage", PrepareBaseModelPipeline) + # run_pipeline_stage("Model Training Stage", TrainingPipeline) + # run_pipeline_stage("Model Evaluation Stage", EvaluationPipeline) diff --git a/research/01_data_ingestion.ipynb b/research/01_data_ingestion.ipynb index 9d61193..3c0eda4 100644 --- a/research/01_data_ingestion.ipynb +++ b/research/01_data_ingestion.ipynb @@ -95,31 +95,38 @@ "outputs": [], "source": [ "class ConfigurationManager:\n", - " def __init__(\n", - " self,\n", - " config_filepath = CONFIG_FILE_PATH,\n", - " params_filepath = PARAMS_FILE_PATH):\n", + " def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):\n", + " \"\"\"\n", + " Initialize ConfigurationManager with configuration and parameter files.\n", "\n", + " Args:\n", + " config_filepath (str): Path to the configuration YAML file.\n", + " params_filepath (str): Path to the parameters YAML file.\n", + " \"\"\"\n", " self.config = read_yaml(config_filepath)\n", " self.params = read_yaml(params_filepath)\n", "\n", " create_directories([self.config.artifacts_root])\n", - " \n", + "\n", " def get_data_ingestion_config(self) -> 
DataIngestionConfig:\n", + " \"\"\"\n", + " Get the data ingestion configuration.\n", + "\n", + " Returns:\n", + " DataIngestionConfig: Configuration object for data ingestion.\n", + " \"\"\"\n", " config = self.config.data_ingestion\n", "\n", " create_directories([config.root_dir])\n", "\n", - " data_ingestion_config = DataIngestionConfig(\n", + " return DataIngestionConfig(\n", " root_dir=config.root_dir,\n", " train_source_URL=config.train_source_URL,\n", " test_source_URL=config.test_source_URL,\n", " train_data_file=config.train_data_file,\n", " test_data_file=config.test_data_file,\n", " unzip_dir=config.unzip_dir\n", - " )\n", - "\n", - " return data_ingestion_config" + " )\n" ] }, { @@ -206,11 +213,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-06-08 15:14:25,629: INFO: helper_functions:20: yaml file: configuration/configuration.yaml loaded successfully]\n", - "[2024-06-08 15:14:25,637: INFO: helper_functions:20: yaml file: parameters.yaml loaded successfully]\n", - "[2024-06-08 15:14:25,643: INFO: helper_functions:35: Created directory at: artifacts]\n", - "[2024-06-08 15:14:25,647: INFO: helper_functions:35: Created directory at: artifacts/data_ingestion]\n", - "[2024-06-08 15:14:25,649: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/traindata.zip]\n" + "[2024-06-08 15:24:19,735: INFO: helper_functions:20: yaml file: configuration/configuration.yaml loaded successfully]\n", + "[2024-06-08 15:24:19,744: INFO: helper_functions:20: yaml file: parameters.yaml loaded successfully]\n", + "[2024-06-08 15:24:19,747: INFO: helper_functions:35: Created directory at: artifacts]\n", + "[2024-06-08 15:24:19,749: INFO: helper_functions:35: Created directory at: artifacts/data_ingestion]\n", + "[2024-06-08 15:24:19,750: INFO: 2052182238:23: Downloading data from 
https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/train_data.zip]\n" ] }, { @@ -219,16 +226,16 @@ "text": [ "Downloading...\n", "From: https://drive.google.com/uc?/export=download&id=15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB\n", - "To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/traindata.zip\n", - "100%|██████████| 3.78M/3.78M [00:03<00:00, 1.10MB/s]" + "To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/train_data.zip\n", + "100%|██████████| 3.78M/3.78M [00:03<00:00, 1.12MB/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "[2024-06-08 15:14:34,967: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/traindata.zip]\n", - "[2024-06-08 15:14:34,970: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/testdata.zip]\n" + "[2024-06-08 15:24:27,176: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/train_data.zip]\n", + "[2024-06-08 15:24:27,178: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/test_data.zip]\n" ] }, { @@ -238,15 +245,15 @@ "\n", "Downloading...\n", "From: https://drive.google.com/uc?/export=download&id=1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er\n", - "To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/testdata.zip\n", - "100%|██████████| 992k/992k [00:00<00:00, 1.10MB/s]" + "To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/test_data.zip\n", + "100%|██████████| 992k/992k [00:00<00:00, 1.19MB/s]" ] }, { "name": 
"stdout", "output_type": "stream", "text": [ - "[2024-06-08 15:14:39,599: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/testdata.zip]\n" + "[2024-06-08 15:24:30,890: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/test_data.zip]\n" ] }, { @@ -260,8 +267,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[2024-06-08 15:14:39,817: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/traindata.zip into: artifacts/data_ingestion]\n", - "[2024-06-08 15:14:39,862: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/testdata.zip into: artifacts/data_ingestion]\n" + "[2024-06-08 15:24:31,090: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/train_data.zip into: artifacts/data_ingestion]\n", + "[2024-06-08 15:24:31,150: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/test_data.zip into: artifacts/data_ingestion]\n" ] } ], diff --git a/src/swahiliNewsClassifier/components/data_ingestion.py b/src/swahiliNewsClassifier/components/data_ingestion.py index e69de29..bc02986 100644 --- a/src/swahiliNewsClassifier/components/data_ingestion.py +++ b/src/swahiliNewsClassifier/components/data_ingestion.py @@ -0,0 +1,61 @@ +import os +import zipfile +import gdown +from swahiliNewsClassifier.entity.entities import DataIngestionConfig +from swahiliNewsClassifier import classifierlogger + +class DataIngestion: + def __init__(self, config: DataIngestionConfig): + """ + Initialize DataIngestion object with the provided configuration. + + Args: + config (DataIngestionConfig): Configuration object for data ingestion. + """ + self.config = config + + def download_file(self): + """Fetch data from a URL. + + Raises: + Exception: If an error occurs during the download process. 
+ """ + os.makedirs("artifacts/data_ingestion", exist_ok=True) + dataset_urls = [self.config.train_source_URL, self.config.test_source_URL] + zip_download_dir = [self.config.train_data_file, self.config.test_data_file] + + for url, dest in zip(dataset_urls, zip_download_dir): + try: + classifierlogger.info(f"Downloading data from {url} into file {dest}") + + file_id = url.split("/")[-2] + prefix = "https://drive.google.com/uc?/export=download&id=" + gdown.download(prefix + file_id, dest) + + classifierlogger.info(f"Downloaded data from {url} into file {dest}") + except Exception as e: + classifierlogger.error(f"Error downloading file from {url} to {dest}") + raise e + + def extract_zip_file(self): + """Extract a zip file. + + This method extracts the contents of a zip file specified in the configuration + to the directory specified in the configuration. + + Raises: + Exception: If an error occurs during the extraction process. + """ + zip_download_dir = [self.config.train_data_file, self.config.test_data_file] + unzip_path = self.config.unzip_dir + os.makedirs(unzip_path, exist_ok=True) + + for zip_file in zip_download_dir: + try: + with zipfile.ZipFile(zip_file, "r") as zip_ref: + zip_ref.extractall(unzip_path) + + classifierlogger.info(f"Extracted zip file {zip_file} into: {unzip_path}") + except Exception as e: + classifierlogger.error(f"Error extracting zip file: {zip_file}") + raise e diff --git a/src/swahiliNewsClassifier/configuration/configuration.py b/src/swahiliNewsClassifier/configuration/configuration.py index e69de29..1062d79 100644 --- a/src/swahiliNewsClassifier/configuration/configuration.py +++ b/src/swahiliNewsClassifier/configuration/configuration.py @@ -0,0 +1,37 @@ +from swahiliNewsClassifier.constants import * +from swahiliNewsClassifier.utilities.helper_functions import read_yaml, create_directories +from swahiliNewsClassifier.entity.entities import DataIngestionConfig + +class ConfigurationManager: + def __init__(self, 
config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH): + """ + Initialize ConfigurationManager with configuration and parameter files. + + Args: + config_filepath (str): Path to the configuration YAML file. + params_filepath (str): Path to the parameters YAML file. + """ + self.config = read_yaml(config_filepath) + self.params = read_yaml(params_filepath) + + create_directories([self.config.artifacts_root]) + + def get_data_ingestion_config(self) -> DataIngestionConfig: + """ + Get the data ingestion configuration. + + Returns: + DataIngestionConfig: Configuration object for data ingestion. + """ + config = self.config.data_ingestion + + create_directories([config.root_dir]) + + return DataIngestionConfig( + root_dir=config.root_dir, + train_source_URL=config.train_source_URL, + test_source_URL=config.test_source_URL, + train_data_file=config.train_data_file, + test_data_file=config.test_data_file, + unzip_dir=config.unzip_dir + ) diff --git a/src/swahiliNewsClassifier/entity/entities.py b/src/swahiliNewsClassifier/entity/entities.py new file mode 100644 index 0000000..e0dd779 --- /dev/null +++ b/src/swahiliNewsClassifier/entity/entities.py @@ -0,0 +1,19 @@ +from dataclasses import dataclass +from pathlib import Path + + +@dataclass(frozen=True) +class DataIngestionConfig: + """ + Configuration class for the data ingestion process. + + Fields mirror the data_ingestion section of configuration.yaml: + train/test source URLs, the local zip paths they are downloaded to, + and the root and unzip directories used by the DataIngestion component. + """ + root_dir: Path + train_source_URL: str + test_source_URL: str + train_data_file: Path + test_data_file: Path + unzip_dir: Path \ No newline at end of file diff --git a/src/swahiliNewsClassifier/entity/entity.py b/src/swahiliNewsClassifier/entity/entity.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/swahiliNewsClassifier/pipeline/stage_01_data_ingestion.py b/src/swahiliNewsClassifier/pipeline/stage_01_data_ingestion.py index e69de29..99b9f4c 100644 --- a/src/swahiliNewsClassifier/pipeline/stage_01_data_ingestion.py +++ b/src/swahiliNewsClassifier/pipeline/stage_01_data_ingestion.py @@ -0,0 +1,31 @@ +from swahiliNewsClassifier.configuration.configuration import ConfigurationManager +from swahiliNewsClassifier.components.data_ingestion import DataIngestion +from swahiliNewsClassifier import classifierlogger + +STAGE_NAME = "Data Ingestion Stage" + +class DataIngestionTrainingPipeline: + def __init__(self): + """ + Initialize the DataIngestionTrainingPipeline object. + """ + self.config = ConfigurationManager() + + def main(self): + """ + Execute the data ingestion process. + """ + try: + classifierlogger.info(f"Starting {STAGE_NAME}") + data_ingestion_config = self.config.get_data_ingestion_config() + data_ingestion = DataIngestion(config=data_ingestion_config) + data_ingestion.download_file() + data_ingestion.extract_zip_file() + classifierlogger.info(f"Completed {STAGE_NAME}\n\n**********************************") + except Exception as e: + classifierlogger.exception(f"An error occurred during {STAGE_NAME}: {e}") + raise e + +if __name__ == '__main__': + pipeline = DataIngestionTrainingPipeline() + pipeline.main()