Commit b936ca1

Finished data ingestion

MartinKalema committed Jun 8, 2024
1 parent 33023df commit b936ca1
Showing 8 changed files with 210 additions and 25 deletions.
4 changes: 2 additions & 2 deletions configuration/configuration.yaml
@@ -4,8 +4,8 @@ data_ingestion:
root_dir: artifacts/data_ingestion
train_source_URL: https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing
test_source_URL: https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing
- train_data_file: artifacts/data_ingestion/traindata.zip
- test_data_file: artifacts/data_ingestion/testdata.zip
+ train_data_file: artifacts/data_ingestion/train_data.zip
+ test_data_file: artifacts/data_ingestion/test_data.zip
unzip_dir: artifacts/data_ingestion

training:
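For reference, the renamed archive paths now match the snake_case names used by the Python modules added below. A minimal sketch of loading this block with plain PyYAML (the project's own read_yaml helper in utilities/helper_functions.py may wrap this differently):

import yaml

# Hypothetical standalone load of configuration/configuration.yaml;
# the repository routes this through read_yaml instead.
with open("configuration/configuration.yaml") as f:
    config = yaml.safe_load(f)

print(config["data_ingestion"]["train_data_file"])  # artifacts/data_ingestion/train_data.zip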
30 changes: 30 additions & 0 deletions main.py
@@ -0,0 +1,30 @@
from swahiliNewsClassifier import classifierlogger
from swahiliNewsClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline
# from swahiliNewsClassifier.pipeline.stage_02_prepare_base_model import PrepareBaseModelPipeline
# from swahiliNewsClassifier.pipeline.stage_03_model_training import TrainingPipeline
# from swahiliNewsClassifier.pipeline.stage_04_model_evaluation import EvaluationPipeline

def run_pipeline_stage(stage_name, pipeline_class):
"""
Run a pipeline stage and handle logging and exceptions.
Args:
stage_name (str): The name of the stage to run.
pipeline_class (class): The class of the pipeline stage to instantiate and run.
"""
try:
classifierlogger.info("*********************************\n")
classifierlogger.info(f">>>>>> {stage_name} started <<<<<<")
pipeline = pipeline_class()
pipeline.main()
classifierlogger.info(f">>>>>> {stage_name} completed <<<<<<<\n")
classifierlogger.info("**********************************\n")
except Exception as e:
classifierlogger.exception(f"An error occurred during {stage_name}: {e}")
raise e

if __name__ == '__main__':
run_pipeline_stage("Data Ingestion Stage", DataIngestionTrainingPipeline)
# run_pipeline_stage("Prepare Base Model Stage", PrepareBaseModelPipeline)
# run_pipeline_stage("Model Training Stage", TrainingPipeline)
# run_pipeline_stage("Model Evaluation Stage", EvaluationPipeline)
53 changes: 30 additions & 23 deletions research/01_data_ingestion.ipynb
@@ -95,31 +95,38 @@
"outputs": [],
"source": [
"class ConfigurationManager:\n",
" def __init__(\n",
" self,\n",
" config_filepath = CONFIG_FILE_PATH,\n",
" params_filepath = PARAMS_FILE_PATH):\n",
" def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):\n",
" \"\"\"\n",
" Initialize ConfigurationManager with configuration and parameter files.\n",
"\n",
" Args:\n",
" config_filepath (str): Path to the configuration YAML file.\n",
" params_filepath (str): Path to the parameters YAML file.\n",
" \"\"\"\n",
" self.config = read_yaml(config_filepath)\n",
" self.params = read_yaml(params_filepath)\n",
"\n",
" create_directories([self.config.artifacts_root])\n",
" \n",
"\n",
" def get_data_ingestion_config(self) -> DataIngestionConfig:\n",
" \"\"\"\n",
" Get the data ingestion configuration.\n",
"\n",
" Returns:\n",
" DataIngestionConfig: Configuration object for data ingestion.\n",
" \"\"\"\n",
" config = self.config.data_ingestion\n",
"\n",
" create_directories([config.root_dir])\n",
"\n",
" data_ingestion_config = DataIngestionConfig(\n",
" return DataIngestionConfig(\n",
" root_dir=config.root_dir,\n",
" train_source_URL=config.train_source_URL,\n",
" test_source_URL=config.test_source_URL,\n",
" train_data_file=config.train_data_file,\n",
" test_data_file=config.test_data_file,\n",
" unzip_dir=config.unzip_dir\n",
" )\n",
"\n",
" return data_ingestion_config"
" )\n"
]
},
{
@@ -206,11 +213,11 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-08 15:14:25,629: INFO: helper_functions:20: yaml file: configuration/configuration.yaml loaded successfully]\n",
"[2024-06-08 15:14:25,637: INFO: helper_functions:20: yaml file: parameters.yaml loaded successfully]\n",
"[2024-06-08 15:14:25,643: INFO: helper_functions:35: Created directory at: artifacts]\n",
"[2024-06-08 15:14:25,647: INFO: helper_functions:35: Created directory at: artifacts/data_ingestion]\n",
"[2024-06-08 15:14:25,649: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/traindata.zip]\n"
"[2024-06-08 15:24:19,735: INFO: helper_functions:20: yaml file: configuration/configuration.yaml loaded successfully]\n",
"[2024-06-08 15:24:19,744: INFO: helper_functions:20: yaml file: parameters.yaml loaded successfully]\n",
"[2024-06-08 15:24:19,747: INFO: helper_functions:35: Created directory at: artifacts]\n",
"[2024-06-08 15:24:19,749: INFO: helper_functions:35: Created directory at: artifacts/data_ingestion]\n",
"[2024-06-08 15:24:19,750: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/train_data.zip]\n"
]
},
{
"text": [
"Downloading...\n",
"From: https://drive.google.com/uc?/export=download&id=15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB\n",
"To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/traindata.zip\n",
"100%|██████████| 3.78M/3.78M [00:03<00:00, 1.10MB/s]"
"To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/train_data.zip\n",
"100%|██████████| 3.78M/3.78M [00:03<00:00, 1.12MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-08 15:14:34,967: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/traindata.zip]\n",
"[2024-06-08 15:14:34,970: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/testdata.zip]\n"
"[2024-06-08 15:24:27,176: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing into file artifacts/data_ingestion/train_data.zip]\n",
"[2024-06-08 15:24:27,178: INFO: 2052182238:23: Downloading data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/test_data.zip]\n"
]
},
{
"\n",
"Downloading...\n",
"From: https://drive.google.com/uc?/export=download&id=1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er\n",
"To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/testdata.zip\n",
"100%|██████████| 992k/992k [00:00<00:00, 1.10MB/s]"
"To: /media/kalema/9954-79C8/Projects/Swahili-News-Classifier/artifacts/data_ingestion/test_data.zip\n",
"100%|██████████| 992k/992k [00:00<00:00, 1.19MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-08 15:14:39,599: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/testdata.zip]\n"
"[2024-06-08 15:24:30,890: INFO: 2052182238:29: Downloaded data from https://drive.google.com/file/d/1mjmYzMdnn_UwSEgTQ7i-cJ5WSOokt9Er/view?usp=sharing into file artifacts/data_ingestion/test_data.zip]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-06-08 15:14:39,817: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/traindata.zip into: artifacts/data_ingestion]\n",
"[2024-06-08 15:14:39,862: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/testdata.zip into: artifacts/data_ingestion]\n"
"[2024-06-08 15:24:31,090: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/train_data.zip into: artifacts/data_ingestion]\n",
"[2024-06-08 15:24:31,150: INFO: 2052182238:52: Extracted zip file artifacts/data_ingestion/test_data.zip into: artifacts/data_ingestion]\n"
]
}
],
61 changes: 61 additions & 0 deletions src/swahiliNewsClassifier/components/data_ingestion.py
@@ -0,0 +1,61 @@
import os
import zipfile
import gdown
from swahiliNewsClassifier.entity.entities import DataIngestionConfig
from swahiliNewsClassifier import classifierlogger

class DataIngestion:
def __init__(self, config: DataIngestionConfig):
"""
Initialize DataIngestion object with the provided configuration.
Args:
config (DataIngestionConfig): Configuration object for data ingestion.
"""
self.config = config

    def download_file(self):
        """Download the training and test data archives from their source URLs.

        Raises:
            Exception: If an error occurs during the download process.
        """
        os.makedirs(self.config.root_dir, exist_ok=True)
        dataset_urls = [self.config.train_source_URL, self.config.test_source_URL]
        download_destinations = [self.config.train_data_file, self.config.test_data_file]

        for url, dest in zip(dataset_urls, download_destinations):
try:
classifierlogger.info(f"Downloading data from {url} into file {dest}")

file_id = url.split("/")[-2]
prefix = "https://drive.google.com/uc?/export=download&id="
gdown.download(prefix + file_id, dest)

classifierlogger.info(f"Downloaded data from {url} into file {dest}")
except Exception as e:
classifierlogger.error(f"Error downloading file from {url} to {dest}")
raise e

    def extract_zip_file(self):
        """Extract the downloaded zip files.

        This method extracts the contents of the train and test archives named in
        the configuration into the directory given by unzip_dir.

        Raises:
            Exception: If an error occurs during the extraction process.
        """
        zip_files = [self.config.train_data_file, self.config.test_data_file]
        unzip_path = self.config.unzip_dir
        os.makedirs(unzip_path, exist_ok=True)

        for zip_file in zip_files:
try:
with zipfile.ZipFile(zip_file, "r") as zip_ref:
zip_ref.extractall(unzip_path)

classifierlogger.info(f"Extracted zip file {zip_file} into: {unzip_path}")
except Exception as e:
classifierlogger.error(f"Error extracting zip file: {zip_file}")
raise e
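The Google Drive share links in the configuration follow the pattern https://drive.google.com/file/d/<file_id>/view?usp=sharing, so url.split("/")[-2] picks out the file id that gdown needs. A quick standalone check of that extraction:

url = "https://drive.google.com/file/d/15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB/view?usp=sharing"
# Split yields ['https:', '', 'drive.google.com', 'file', 'd', '<file_id>', 'view?usp=sharing'],
# so the second-to-last segment is the file id.
file_id = url.split("/")[-2]
assert file_id == "15stuLDZkXNOgBUC1rnx5yXYdVPViUjNB"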
37 changes: 37 additions & 0 deletions src/swahiliNewsClassifier/configuration/configuration.py
@@ -0,0 +1,37 @@
from swahiliNewsClassifier.constants import *
from swahiliNewsClassifier.utilities.helper_functions import read_yaml, create_directories
from swahiliNewsClassifier.entity.entities import DataIngestionConfig

class ConfigurationManager:
def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
"""
Initialize ConfigurationManager with configuration and parameter files.
Args:
config_filepath (str): Path to the configuration YAML file.
params_filepath (str): Path to the parameters YAML file.
"""
self.config = read_yaml(config_filepath)
self.params = read_yaml(params_filepath)

create_directories([self.config.artifacts_root])

def get_data_ingestion_config(self) -> DataIngestionConfig:
"""
Get the data ingestion configuration.
Returns:
DataIngestionConfig: Configuration object for data ingestion.
"""
config = self.config.data_ingestion

create_directories([config.root_dir])

return DataIngestionConfig(
root_dir=config.root_dir,
train_source_URL=config.train_source_URL,
test_source_URL=config.test_source_URL,
train_data_file=config.train_data_file,
test_data_file=config.test_data_file,
unzip_dir=config.unzip_dir
)
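Attribute-style access such as self.config.artifacts_root and config.data_ingestion implies that read_yaml returns something richer than a plain dict. A minimal sketch of such a helper, assuming the python-box package (the actual implementation in utilities/helper_functions.py may differ):

import yaml
from box import ConfigBox  # assumption: python-box provides attribute access over dicts

def read_yaml(path) -> ConfigBox:
    # Parse the YAML file and wrap it so nested keys are reachable as attributes,
    # e.g. config.data_ingestion.root_dir instead of config["data_ingestion"]["root_dir"].
    with open(path) as f:
        return ConfigBox(yaml.safe_load(f))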
19 changes: 19 additions & 0 deletions src/swahiliNewsClassifier/entity/entities.py
@@ -0,0 +1,19 @@
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """
    Configuration class for the data ingestion process.

    Attributes:
        root_dir (Path): The root directory where data will be stored or processed.
        train_source_URL (str): The URL from which the training data will be fetched.
        test_source_URL (str): The URL from which the test data will be fetched.
        train_data_file (Path): The local path where the downloaded training archive will be stored.
        test_data_file (Path): The local path where the downloaded test archive will be stored.
        unzip_dir (Path): The directory where the downloaded archives will be extracted.
    """
    root_dir: Path
    train_source_URL: str
    test_source_URL: str
    train_data_file: Path
    test_data_file: Path
    unzip_dir: Path
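Because the dataclass is frozen, a configuration object cannot be mutated after creation; assignment raises FrozenInstanceError. A small illustration with placeholder values:

from dataclasses import FrozenInstanceError
from pathlib import Path

from swahiliNewsClassifier.entity.entities import DataIngestionConfig

cfg = DataIngestionConfig(
    root_dir=Path("artifacts/data_ingestion"),
    train_source_URL="https://example.com/train",  # placeholder, not the real Drive URL
    test_source_URL="https://example.com/test",    # placeholder, not the real Drive URL
    train_data_file=Path("artifacts/data_ingestion/train_data.zip"),
    test_data_file=Path("artifacts/data_ingestion/test_data.zip"),
    unzip_dir=Path("artifacts/data_ingestion"),
)

try:
    cfg.root_dir = Path("elsewhere")
except FrozenInstanceError:
    print("DataIngestionConfig is immutable")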
Empty file.
31 changes: 31 additions & 0 deletions src/swahiliNewsClassifier/pipeline/stage_01_data_ingestion.py
@@ -0,0 +1,31 @@
from swahiliNewsClassifier.configuration.configuration import ConfigurationManager
from swahiliNewsClassifier.components.data_ingestion import DataIngestion
from swahiliNewsClassifier import classifierlogger

STAGE_NAME = "Data Ingestion Stage"

class DataIngestionTrainingPipeline:
def __init__(self):
"""
Initialize the DataIngestionTrainingPipeline object.
"""
self.config = ConfigurationManager()

def main(self):
"""
Execute the data ingestion process.
"""
try:
classifierlogger.info(f"Starting {STAGE_NAME}")
data_ingestion_config = self.config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
data_ingestion.extract_zip_file()
classifierlogger.info(f"Completed {STAGE_NAME}\n\n**********************************")
except Exception as e:
classifierlogger.exception(f"An error occurred during {STAGE_NAME}: {e}")
raise e

if __name__ == '__main__':
pipeline = DataIngestionTrainingPipeline()
pipeline.main()
