Skip to content

Commit

Permalink
Merge pull request #535 from vanithakattumuri/main
Browse files Browse the repository at this point in the history
#2 Code documentation of  CSV2Parquet.py file.
  • Loading branch information
udayRage authored Oct 18, 2024
2 parents f773d23 + 8e642e1 commit eac0d20
Showing 1 changed file with 198 additions and 36 deletions.
234 changes: 198 additions & 36 deletions PAMI/extras/convert/CSV2Parquet.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,34 @@
# #csvParquet is a code used to convert temporal and utility types into sparse and dense format.
# #
# #**Importing this algorithm into a python program**
# #--------------------------------------------------------
# #
# # from PAMI.extras.csvParquet import csvParquet as cp
# #
# # obj = cp.csvParquet(iFile, "\t", " ", " " )
# #
# # obj.save()
# #
# # obj.csvParquet("FileName") # To generate file in form of sparse or dense
# #
# # obj.parquetFormat("FileName") # To generate file in form of sparse or dense
# #





# __copyright__ = """
# Copyright (C) 2021 Rage Uday Kiran

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# """
# CSV2Parquet converts CSV files into Parquet format, and Parquet2CSV converts Parquet files back into CSV files.
#
# **Importing this algorithm into a python program**
#
# import PAMI.extras.convert.CSV2Parquet as cp
#
# obj = cp.CSV2Parquet('sampleDB.csv', 'output.parquet', sep)
#
# obj = cp.Parquet2CSV('input.parquet', 'sampleDB.csv', sep)
#
# obj.convert()
#
# obj.printStats()
#

__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

# import pandas as pd
# import numpy as np
Expand Down Expand Up @@ -354,12 +349,73 @@


class CSV2Parquet:
"""
**About this algorithm**
:**Description**: This class is to convert CSV files into Parquet format.
:**Reference**:
:**Parameters**: - **inputFile** (*str*) -- *Path to the input CSV file.*
- **outputFile** (*str*) -- *Path to the output Parquet file.*
- **sep** (*str*) -- *This variable is used to distinguish items from one another. The default separator is tab space. However, users can override the default separator.*
:**Attributes**: - **getMemoryUSS** (*int*) -- *Returns the memory used by the process in USS.*
- **getMemoryRSS** (*int*) -- *Returns the memory used by the process in RSS.*
- **getRuntime()** (*float*) -- *Returns the time taken to execute the conversion.*
- **printStats()** -- *Prints statistics about memory usage and runtime.*
:**Methods**: - **convert()** -- *Reads the input file, converts it to a Parquet file, and tracks memory usage and runtime.*
**Execution methods**
**Terminal command**
.. code-block:: console
Format:
(.venv) $ python3 CSV2Parquet.py <inputFile> <outputFile> <sep>
Example Usage:
(.venv) $ python3 CSV2Parquet.py sampleDB.csv output.parquet \t
**Calling from a python program**
.. code-block:: python
import PAMI.extras.convert.CSV2Parquet as cp
inputFile = 'sampleDB.csv'
sep = "\t"
outputFile = 'output.parquet'
obj = cp.CSV2Parquet(inputFile, outputFile, sep)
obj.convert()
obj.printStats()
**Credits**
The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran.
"""
def __init__(self, inputFile, outputFile, sep):
self.inputFile = inputFile
self.outputFile = outputFile
self.sep = sep

def convert(self):
"""
This function converts the input CSV file to a data frame, which is then transformed into a Parquet file.
"""
self.start = time.time()
file = []
maxLen = 0
Expand All @@ -383,15 +439,38 @@ def convert(self):
self.memoryRSS = process.memory_info().rss

def getMemoryUSS(self):
    """
    Report the USS (Unique Set Size) value stored on this object.

    The value is read from ``self.memoryUSS``; the constructor does not set
    that attribute, so it must have been assigned before this call.

    :return: The stored amount of memory (in bytes) used exclusively by the process.
    :rtype: int
    """
    ussBytes = self.memoryUSS
    return ussBytes

def getMemoryRSS(self):
    """
    Report the RSS (Resident Set Size) value stored on this object.

    The value is read from ``self.memoryRSS``; the constructor does not set
    that attribute, so it must have been assigned before this call.

    :return: The stored total memory (in bytes) used by the process in RAM.
    :rtype: int
    """
    rssBytes = self.memoryRSS
    return rssBytes

def getRuntime(self):
    """
    Report how long the CSV to Parquet conversion took.

    The result is the difference between the stored ``self.end`` and
    ``self.start`` timestamps.

    :return: The runtime of the conversion process in seconds.
    :rtype: float
    """
    # Read end before start, exactly as the single-expression form does.
    finishedAt = self.end
    startedAt = self.start
    return finishedAt - startedAt

def printStats(self):
"""
Prints the resource usage statistics including memory consumption (USS and RSS) and the runtime.
:return: Prints memory usage and runtime to the console.
"""
print("Memory usage (USS):", self.memoryUSS)
print("Memory usage (RSS):", self.memoryRSS)
print("Runtime:", self.end - self.start)
Expand All @@ -406,12 +485,72 @@ def printStats(self):


class Parquet2CSV:
"""
**About this algorithm**
:**Description**: This class is to convert Parquet format into CSV file.
:**Reference**:
:**Parameters**: - **inputFile** (*str*) -- *Path to the input Parquet file.*
- **outputFile** (*str*) -- *Path to the output CSV file.*
- **sep** (*str*) -- *This variable is used to distinguish items from one another. The default separator is tab space. However, users can override the default separator.*
:**Attributes**: - **getMemoryUSS** (*int*) -- *Returns the memory used by the process in USS.*
- **getMemoryRSS** (*int*) -- *Returns the memory used by the process in RSS.*
- **getRuntime()** (*float*) -- *Returns the time taken to execute the conversion.*
- **printStats()** -- *Prints statistics about memory usage and runtime.*
:**Methods**: - **convert()** -- *Reads the Parquet file, converts it to a CSV file, and tracks memory usage and runtime.*
**Execution methods**
**Terminal command**
.. code-block:: console
Format:
(.venv) $ python3 CSV2Parquet.py <inputFile> <outputFile> <sep>
Example Usage:
(.venv) $ python3 CSV2Parquet.py output.parquet sampleDB.csv \t
**Calling from a python program**
.. code-block:: python
import PAMI.extras.convert.Parquet2CSV as pc
inputFile = 'output.parquet'
sep = "\t"
outputFile = 'sampleDB.csv'
obj = pc.Parquet2CSV(inputFile, outputFile, sep)
obj.convert()
obj.printStats()
**Credits**
The complete program was written by P. Likhitha and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran.
"""
def __init__(self, inputFile, outputFile, sep):
self.inputFile = inputFile
self.outputFile = outputFile
self.sep = sep

def convert(self):
"""
This function converts the input Parquet file into a CSV file where each row is joined by the specified separator and written to the output file.
"""
self.start = time.time()
df = pd.read_parquet(self.inputFile)

Expand All @@ -427,15 +566,38 @@ def convert(self):
self.memoryRSS = process.memory_info().rss

def getMemoryUSS(self):
    """
    Report the USS (Unique Set Size) value stored on this object.

    The value is read from ``self.memoryUSS``; the constructor does not set
    that attribute, so it must have been assigned before this call.

    :return: The stored amount of memory (in bytes) used exclusively by the process.
    :rtype: int
    """
    ussBytes = self.memoryUSS
    return ussBytes

def getMemoryRSS(self):
    """
    Report the RSS (Resident Set Size) value stored on this object.

    The value is read from ``self.memoryRSS``; the constructor does not set
    that attribute, so it must have been assigned before this call.

    :return: The stored total memory (in bytes) used by the process in RAM.
    :rtype: int
    """
    rssBytes = self.memoryRSS
    return rssBytes

def getRuntime(self):
    """
    Report how long the Parquet to CSV conversion took.

    The result is the difference between the stored ``self.end`` and
    ``self.start`` timestamps.

    :return: The runtime of the conversion process in seconds.
    :rtype: float
    """
    # Read end before start, exactly as the single-expression form does.
    finishedAt = self.end
    startedAt = self.start
    return finishedAt - startedAt

def printStats(self):
"""
Prints the resource usage statistics including memory consumption (USS and RSS) and the runtime.
:return: Prints memory usage and runtime to the console.
"""
print("Memory usage (USS):", self.memoryUSS)
print("Memory usage (RSS):", self.memoryRSS)
print("Runtime:", self.end - self.start)
Expand Down

0 comments on commit eac0d20

Please sign in to comment.