From 8e642e13386e9d491c13c6c16c65d4d4f579aa4f Mon Sep 17 00:00:00 2001 From: vanithakattumuri Date: Fri, 18 Oct 2024 19:47:17 +0530 Subject: [PATCH] #2 written the code documentation of file CSV2Parquet.py --- PAMI/extras/convert/CSV2Parquet.py | 234 ++++++++++++++++++++++++----- 1 file changed, 198 insertions(+), 36 deletions(-) diff --git a/PAMI/extras/convert/CSV2Parquet.py b/PAMI/extras/convert/CSV2Parquet.py index ab9ccd10..4e7bd24a 100644 --- a/PAMI/extras/convert/CSV2Parquet.py +++ b/PAMI/extras/convert/CSV2Parquet.py @@ -1,39 +1,34 @@ -# #csvParquet is a code used to convert temporal and utility types into sparse and dense format. -# # -# #**Importing this algorithm into a python program** -# #-------------------------------------------------------- -# # -# # from PAMI.extras.csvParquet import csvParquet as cp -# # -# # obj = cp.csvParquet(iFile, "\t", " ", " " ) -# # -# # obj.save() -# # -# # obj.csvParquet("FileName") # To generate file in form of sparse or dense -# # -# # obj.parquetFormat("FileName") # To generate file in form of sparse or dense -# # - - - - - -# __copyright__ = """ -# Copyright (C) 2021 Rage Uday Kiran - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -# """ +# csvParquet is a code used to convert temporal and utility types into sparse and dense format. 
+# +# **Importing this algorithm into a python program** +# +# import PAMI.extras.convert.CSV2Parquet as cp +# +# obj = cp.CSV2Parquet('sampleDB.csv', 'output.parquet', sep) +# +# obj = cp.Parquet2CSV('input.parquet', 'sampleDB.csv', sep) +# +# obj.convert() +# +# obj.printStats() +# + +__copyright__ = """ +Copyright (C) 2021 Rage Uday Kiran + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. +""" # import pandas as pd # import numpy as np @@ -354,12 +349,73 @@ class CSV2Parquet: + """ + **About this algorithm** + + :**Description**: This class converts CSV files into Parquet format. + + :**Reference**: + + :**Parameters**: - **inputFile** (*str*) -- *Path to the input CSV file.* + - **outputFile** (*str*) -- *Path to the output Parquet file.* + - **sep** (*str*) -- *This variable is used to distinguish items from one another. The default separator is tab space. 
However, the users can override their default separator.* + + :**Attributes**: - **getMemoryUSS** (*int*) -- *Returns the memory used by the process in USS.* + - **getMemoryRSS** (*int*) -- *Returns the memory used by the process in RSS.* + - **getRuntime()** (*float*) -- *Returns the time taken to execute the conversion.* + - **printStats()** -- *Prints statistics about memory usage and runtime.* + + :**Methods**: - **convert()** -- *Reads the input file, converts it to a Parquet file, and tracks memory usage and runtime.* + + + **Execution methods** + + **Terminal command** + + .. code-block:: console + + Format: + + (.venv) $ python3 CSV2Parquet.py <inputFile> <outputFile> <sep> + + Example Usage: + + (.venv) $ python3 CSV2Parquet.py sampleDB.csv output.parquet \t + + + **Calling from a python program** + + .. code-block:: python + + import PAMI.extras.convert.CSV2Parquet as cp + + inputFile = 'sampleDB.csv' + + sep = "\t" + + outputFile = 'output.parquet' + + obj = cp.CSV2Parquet(inputFile, outputFile, sep) + + obj.convert() + + obj.printStats() + + + **Credits** + + The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran. + + """ def __init__(self, inputFile, outputFile, sep): self.inputFile = inputFile self.outputFile = outputFile self.sep = sep def convert(self): + """ + This function converts the input CSV file to a data frame, which is then transformed into a Parquet file. + """ self.start = time.time() file = [] maxLen = 0 @@ -383,15 +439,38 @@ def convert(self): self.memoryRSS = process.memory_info().rss def getMemoryUSS(self): + """ + Returns the memory used by the process in USS (Unique Set Size). + + :return: The amount of memory (in bytes) used exclusively by the process. + :rtype: int + """ return self.memoryUSS def getMemoryRSS(self): + """ + Returns the memory used by the process in RSS (Resident Set Size). + + :return: The total memory (in bytes) used by the process in RAM. 
+ :rtype: int + """ return self.memoryRSS def getRuntime(self): + """ + Returns the time taken to complete the CSV to Parquet conversion. + + :return: The runtime of the conversion process in seconds. + :rtype: float + """ return self.end - self.start def printStats(self): + """ + Prints the resource usage statistics including memory consumption (USS and RSS) and the runtime. + + :return: None. Memory usage and runtime are printed to the console. + """ print("Memory usage (USS):", self.memoryUSS) print("Memory usage (RSS):", self.memoryRSS) print("Runtime:", self.end - self.start) @@ -406,12 +485,72 @@ def printStats(self): class Parquet2CSV: + """ + **About this algorithm** + + :**Description**: This class converts Parquet files into CSV format. + + :**Reference**: + + :**Parameters**: - **inputFile** (*str*) -- *Path to the input Parquet file.* + - **outputFile** (*str*) -- *Path to the output CSV file.* + - **sep** (*str*) -- *This variable is used to distinguish items from one another. The default separator is tab space. However, the users can override their default separator.* + + :**Attributes**: - **getMemoryUSS** (*int*) -- *Returns the memory used by the process in USS.* + - **getMemoryRSS** (*int*) -- *Returns the memory used by the process in RSS.* + - **getRuntime()** (*float*) -- *Returns the time taken to execute the conversion.* + - **printStats()** -- *Prints statistics about memory usage and runtime.* + + :**Methods**: - **convert()** -- *Reads the Parquet file, converts it to a CSV file, and tracks memory usage and runtime.* + + **Execution methods** + + **Terminal command** + + .. code-block:: console + + Format: + + (.venv) $ python3 CSV2Parquet.py <inputFile> <outputFile> <sep> + + Example Usage: + + (.venv) $ python3 CSV2Parquet.py output.parquet sampleDB.csv \t + + + **Calling from a python program** + + .. 
code-block:: python + + import PAMI.extras.convert.Parquet2CSV as pc + + inputFile = 'output.parquet' + + sep = "\t" + + outputFile = 'sampleDB.csv' + + obj = pc.Parquet2CSV(inputFile, outputFile, sep) + + obj.convert() + + obj.printStats() + + + **Credits** + + The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran. + + """ def __init__(self, inputFile, outputFile, sep): self.inputFile = inputFile self.outputFile = outputFile self.sep = sep def convert(self): + """ + This function converts the input Parquet file into a CSV file where each row is joined by the specified separator and written to the output file. + """ self.start = time.time() df = pd.read_parquet(self.inputFile) @@ -427,15 +566,38 @@ def convert(self): self.memoryRSS = process.memory_info().rss def getMemoryUSS(self): + """ + Returns the memory used by the process in USS (Unique Set Size). + + :return: The amount of memory (in bytes) used exclusively by the process. + :rtype: int + """ return self.memoryUSS def getMemoryRSS(self): + """ + Returns the memory used by the process in RSS (Resident Set Size). + + :return: The total memory (in bytes) used by the process in RAM. + :rtype: int + """ return self.memoryRSS def getRuntime(self): + """ + Returns the time taken to complete the Parquet to CSV conversion. + + :return: The runtime of the conversion process in seconds. + :rtype: float + """ return self.end - self.start def printStats(self): + """ + Prints the resource usage statistics including memory consumption (USS and RSS) and the runtime. + + :return: None. Memory usage and runtime are printed to the console. + """ print("Memory usage (USS):", self.memoryUSS) print("Memory usage (RSS):", self.memoryRSS) print("Runtime:", self.end - self.start)