From e8274a3fd03adc9fd7aed7f256f72a1805610122 Mon Sep 17 00:00:00 2001 From: RAGE UDAY KIRAN <52146396+udayRage@users.noreply.github.com> Date: Sat, 9 Nov 2024 14:31:05 +0900 Subject: [PATCH] #300 bug resolved --- .../TemporalDatabase.py | 308 ++++++++---------- 1 file changed, 131 insertions(+), 177 deletions(-) diff --git a/PAMI/extras/syntheticDataGenerator/TemporalDatabase.py b/PAMI/extras/syntheticDataGenerator/TemporalDatabase.py index bfb00624..4651b40c 100644 --- a/PAMI/extras/syntheticDataGenerator/TemporalDatabase.py +++ b/PAMI/extras/syntheticDataGenerator/TemporalDatabase.py @@ -1,151 +1,27 @@ -# TemporalDatabase is a collection of timestamps and along with data at particular time. -# -# **Importing this algorithm into a python program** -# -# from PAMI.extras.syntheticDataGenerator import TemporalDatabase as db -# -# temporalDB = db.TemporalDatabase(numOfTransactions, avgTransactionLength, numItems, outFileName, percentage, sep, occurrenceProbabilityAtSameTimestamp, occurrenceProbabilityToSkipSubsequentTimestamp) -# -# temporalDB.create() -# -# - - -__copyright__ = """ - Copyright (C) 2021 Rage Uday Kiran - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - You should have received a copy of the GNU General Public License - along with this program. If not, see . -""" - import pandas as pd import numpy as np import sys +import time +import os +import psutil class TemporalDatabase: - """ - :Description: - creates a temporal database with required parameter (e.g.,databaseSize, avgItemsPerTransaction, numItems and outputFile). - - output can be printed in two ways either in text file or dataframe depending on the input type. - - :Attributes: - - :param databaseSize: int - number of transactions - - :param avgItemsPerTransaction: int - average length of transactions - - This class generates a temporal database based on the given parameters and provides - options to output the database in either a text file or a DataFrame format. - - **Importing this algorithm into a Python program** - - - from PAMI.extras.syntheticDataGenerator import TemporalDatabase as db - - temporalDB = db.TemporalDatabase(numOfTransactions, avgTransactionLength, numItems, outFileName, percentage, sep, occurrenceProbabilityAtSameTimestamp, occurrenceProbabilityToSkipSubsequentTimestamp) - - temporalDB.create() - - - **Methods to execute code on terminal** - - - Format: - - (.venv) $ python3 TemporalDatabase.py - - - Example Usage: - (.venv) $ python3 TemporalDatabase.py 50 10 100 temporal.txt 50 \t database 0.1 0.1 - - - :param databaseSize: int - Number of transactions to generate. - - :param avgItemsPerTransaction: int - Average length of transactions. - - :param numItems: int - Number of items in the database. - - :param outputFile: str - Name of the output file for the database. - - :param percentage: int - Percentage for the coin toss to decide if a transaction will be included in the output. - If the value is greater than 1, it is treated as a percentage (i.e., 50 for 50%). - - :param sep: str - Separator for the output file (default is tab). - - :param typeOfFile: str - Type of output file. Can be 'database' for a text file or 'dataframe' for a DataFrame output. - - :param occurrenceProbabilityAtSameTimestamp: float - Probability that a new transaction will occur at the same timestamp as the previous one. - - :param occurrenceProbabilityToSkipSubsequentTimestamp: float - Probability that the timestamp will be skipped for subsequent transactions. - """ - - def __init__(self, databaseSize: int, avgItemsPerTransaction: int, - numItems: int, outputFile: str, percentage: int = 50, - sep: str = '\t', typeOfFile: str = "Database", + def __init__(self, numOfTransactions: int, + avgLenOfTransactions: int, + numItems: int, + sep: str = '\t', occurrenceProbabilityAtSameTimestamp: float = 0.1, occurrenceProbabilityToSkipSubsequentTimestamp: float = 0.1) -> None: - """ - Initialize the TemporalDatabase with required parameters. - - :param databaseSize: Number of transactions to generate. - :param avgItemsPerTransaction: Average length of transactions. - :param numItems: Number of items in the database. - :param outputFile: Name of the output file for the database. - :param percentage: Percentage for the coin toss to include transactions. - :param sep: Separator for the output file. - :param typeOfFile: Type of output file ('database' or 'dataframe'). - :param occurrenceProbabilityAtSameTimestamp: Probability for same timestamp. - :param occurrenceProbabilityToSkipSubsequentTimestamp: Probability to skip subsequent timestamp. - """ - self.databaseSize = databaseSize - self.avgItemsPerTransaction = avgItemsPerTransaction + self.numOfTransactions = numOfTransactions + self.avgItemsPerTransaction = avgLenOfTransactions self.numItems = numItems - self.outputFile = outputFile - if percentage > 1: - self.percentage = percentage / 100 - else: - self.percentage = percentage self.sep = sep - self.typeOfFile = typeOfFile.lower() self.occurrenceProbabilityAtSameTimestamp = occurrenceProbabilityAtSameTimestamp self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp - def getFileName(self) -> str: - """ - Returns the name of the output file. - - :return: Output file name. - """ - return self.outputFile - - def getDatabaseAsDataFrame(self) -> pd.DataFrame: - """ - Returns the database as a DataFrame. - - :return: pd.DataFrame containing the temporal database. - """ - return self.df - def performCoinFlip(self, probability: float) -> bool: """ Perform a coin flip with the given probability. @@ -159,23 +35,34 @@ def performCoinFlip(self, probability: float) -> bool: def tuning(self, array, sumRes) -> list: """ Tune the array to ensure that the sum of the values equals sumRes. - - :param array: List of values to be tuned. - :type array: list - :param sumRes: Target sum for the array values. - :type sumRes: int - :return: Tuned list of values. """ - values = np.random.randint(1, self.numItems, len(array)) + num_transactions = len(array) + # Initialize values randomly between 1 and numItems + values = np.random.randint(1, self.numItems + 1, num_transactions) + + # normalize values to ensure sum equals sumRes + values = values / np.sum(values) * sumRes + values = np.round(values).astype(int) + # Adjust values to ensure sum equals sumRes while np.sum(values) != sumRes: - if np.sum(values) > sumRes: - maxIndex = np.argmax(values) - values[maxIndex] -= 1 + current_sum = np.sum(values) + if current_sum > sumRes: + # Decrease the value of a random index + indices = np.where(values > 1)[0] + if len(indices) == 0: + raise ValueError("Cannot adjust values to meet sumRes") + idx = np.random.choice(indices) + values[idx] -= 1 else: - minIndex = np.argmin(values) - values[minIndex] += 1 - + # Increase the value of a random index + indices = np.where(values < self.numItems)[0] + if len(indices) == 0: + raise ValueError("Cannot adjust values to meet sumRes") + idx = np.random.choice(indices) + values[idx] += 1 + + # Assign adjusted values back to array for i in range(len(array)): array[i][1] = values[i] @@ -185,12 +72,17 @@ def create(self) -> None: """ Create the temporal database or DataFrame based on the specified type of file. """ - db = [] + start = time.time() + + self.db = [] lineSize = [] self.current_timestamp = 0 # Initialize current timestamp - for i in range(self.databaseSize): + sumRes = self.numOfTransactions * self.avgItemsPerTransaction # Total number of items + + for i in range(self.numOfTransactions): + # Determine the timestamp if self.performCoinFlip(self.occurrenceProbabilityAtSameTimestamp): timestamp = self.current_timestamp else: @@ -200,46 +92,108 @@ def create(self) -> None: self.current_timestamp += 1 timestamp = self.current_timestamp - db.append([timestamp]) - if self.performCoinFlip(self.percentage): - lineSize.append([i, 0]) + self.db.append([timestamp]) # Start the transaction with the timestamp - sumRes = self.databaseSize * self.avgItemsPerTransaction + lineSize.append([i, 0]) # Initialize lineSize with 0 for each transaction - self.tuning(lineSize, sumRes) + # Adjust lineSize to ensure sum of sizes equals sumRes + lineSize = self.tuning(lineSize, sumRes) + # For each transaction, generate items for i in range(len(lineSize)): - if lineSize[i][1] > self.numItems: + transaction_index = lineSize[i][0] + num_items = lineSize[i][1] + if num_items > self.numItems: raise ValueError( - "Error: Either increase numItems or decrease avgItemsPerTransaction or modify percentage") - line = np.random.choice(range(1, self.numItems + 1), lineSize[i][1], replace=False) - db[lineSize[i][0]].extend(line) + "Error: Either increase numItems or decrease avgLenOfTransactions or modify percentage") + items = np.random.choice(range(1, self.numItems + 1), num_items, replace=False) + self.db[transaction_index].extend(items) + + self._runTime = time.time() - start + process = psutil.Process(os.getpid()) + self._memoryUSS = process.memory_full_info().uss + self._memoryRSS = process.memory_info().rss + + def save(self, outputFile: str = None) -> None: + """ + Save the temporal database to the specified output file. + """ + if outputFile is not None: + self.outputFile = outputFile + else: + self.outputFile = "temporalDatabase.csv" - if self.typeOfFile == "database": - with open(self.outputFile, "w") as outFile: - for line in db: - outFile.write(self.sep.join(map(str, line)) + '\n') + with open(self.outputFile, 'w') as writer: + for line in self.db: + writer.write(self.sep.join(map(str, line)) + '\n') - if self.typeOfFile == "dataframe": - data = { - 'timestamp': [line[0] for line in db], - 'transactions': pd.Series([line[1:] for line in db]) - } - self.df = pd.DataFrame(data) + def getRuntime(self) -> float: + """ + Returns the runtime of the algorithm in seconds. + """ + return self._runTime - print("Temporal database created successfully") + def getMemoryRSS(self) -> int: + """ + """ + return self._memoryRSS + + def getMemoryUSS(self) -> int: + """ + """ + return self._memoryUSS + + def getTransactions(self) -> None: + """ + Convert the database to a DataFrame. + """ + # merge all the transactions into a single DataFrame + timestamps = [] + transactions = [] + + for line in self.db: + timestamps.append(line[0]) + transactions.append(line[1:]) + + self.df = pd.DataFrame([timestamps, transactions], index=['Timestamp', 'Items']).T + + return self.df if __name__ == '__main__': - if len(sys.argv) != 10: - print("Usage: python TemporalDatabase.py ") + if len(sys.argv) == 10: + obj = TemporalDatabase( + numOfTransactions=int(sys.argv[1]), + avgLenOfTransactions=int(sys.argv[2]), + numItems=int(sys.argv[3]), + outputFile=sys.argv[4], + percentage=int(sys.argv[5]), + sep=sys.argv[6], + typeOfFile=sys.argv[7], + occurrenceProbabilityAtSameTimestamp=float(sys.argv[8]), + occurrenceProbabilityToSkipSubsequentTimestamp=float(sys.argv[9]) + ) + obj.create() + obj.save() + else: + print( + "Usage: python TemporalDatabase.py ") + + obj = TemporalDatabase( + numOfTransactions=100000, + avgLenOfTransactions=10, + numItems=50, + sep="\t", + occurrenceProbabilityAtSameTimestamp=0.1, + occurrenceProbabilityToSkipSubsequentTimestamp=0.1 + ) + obj.create() + obj.save("temporalDatabase.txt") + + print(obj.getTransactions()) + sys.exit(1) - obj = TemporalDatabase( - int(sys.argv[1]), int(sys.argv[2]), int(sys.argv[3]), sys.argv[4], - percentage=int(sys.argv[5]), sep=sys.argv[6], typeOfFile=sys.argv[7], - occurrenceProbabilityAtSameTimestamp=float(sys.argv[8]), - occurrenceProbabilityToSkipSubsequentTimestamp=float(sys.argv[9]) - ) - obj.create() + +