diff --git a/PAMI/extras/syntheticDataGenerator/GeoReferentialTransactionalDatabase.py b/PAMI/extras/syntheticDataGenerator/GeoReferentialTransactionalDatabase.py index 0cdec719..387282e5 100644 --- a/PAMI/extras/syntheticDataGenerator/GeoReferentialTransactionalDatabase.py +++ b/PAMI/extras/syntheticDataGenerator/GeoReferentialTransactionalDatabase.py @@ -1,7 +1,7 @@ # generateTransactionalDatabase is a code used to convert the database into Temporal database. # # **Importing this algorithm into a python program** -# -------------------------------------------------------- +# # from PAMI.extras.generateDatabase import generateTransactionalDatabase as db # obj = db(10, 5, 10) # obj.create() @@ -9,7 +9,7 @@ # print(obj.getTransactions()) to get the transactional database as a pandas dataframe # **Running the code from the command line** -# -------------------------------------------------------- +# # python generateDatabase.py 10 5 10 db.txt # cat db.txt # @@ -121,16 +121,15 @@ def tuning(self, array, sumRes) -> list: """ while np.sum(array) != sumRes: - # get index of largest value - randIndex = np.random.randint(0, len(array)) - # if sum is too large, decrease the largest value + if np.sum(array) > sumRes: - array[randIndex] -= 1 - # if sum is too small, increase the smallest value + maxIndex = np.argmax(array) + array[maxIndex] -= 1 + # if sum is too small, increase the smallest value else: minIndex = np.argmin(array) - array[randIndex] += 1 - return array + array[minIndex] += 1 + return array def generateArray(self, nums, avg, maxItems) -> list: """ @@ -154,7 +153,7 @@ def generateArray(self, nums, avg, maxItems) -> list: """ # generate n random values - values = np.random.randint(1, maxItems, nums) + values = np.random.randint(1, avg * 1.5, nums) sumRes = nums * avg diff --git a/PAMI/extras/syntheticDataGenerator/GeoreferentialTemporalDatabase.py b/PAMI/extras/syntheticDataGenerator/GeoreferentialTemporalDatabase.py index 77151278..d55c7b39 100644 --- a/PAMI/extras/syntheticDataGenerator/GeoreferentialTemporalDatabase.py +++ b/PAMI/extras/syntheticDataGenerator/GeoreferentialTemporalDatabase.py @@ -57,6 +57,7 @@ def __init__( self.seperator = sep self.occurrenceProbabilityOfSameTimestamp = occurrenceProbabilityOfSameTimestamp self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp + self.current_timestamp = int() self._startTime = float() self._endTime = float() self._memoryUSS = float() @@ -76,7 +77,7 @@ def __init__( def getPoint(self, x1, y1, x2, y2): - return (np.random.randint(x1, x2), np.random.randint(y1, y2)) + return (np.random.randint(x1, x2),np.random.randint(y1, y2)) def performCoinFlip(self, probability: float) -> bool: """ @@ -86,7 +87,7 @@ def performCoinFlip(self, probability: float) -> bool: :return: True if the coin lands heads, False otherwise. """ result = np.random.choice([0, 1], p=[1 - probability, probability]) - return result == 1 + return result def tuning(self, array, sumRes) -> list: """ @@ -106,16 +107,14 @@ def tuning(self, array, sumRes) -> list: """ while np.sum(array) != sumRes: - # get index of largest value - randIndex = np.random.randint(0, len(array)) - # if sum is too large, decrease the largest value if np.sum(array) > sumRes: - array[randIndex] -= 1 - # if sum is too small, increase the smallest value + maxIndex = np.argmax(array) + array[maxIndex] -= 1 + # if sum is too small, increase the smallest value else: minIndex = np.argmin(array) - array[randIndex] += 1 - return array + array[minIndex] += 1 + return array def generateArray(self, nums, avg, maxItems) -> list: """ @@ -139,7 +138,7 @@ def generateArray(self, nums, avg, maxItems) -> list: """ # generate n random values - values = np.random.randint(1, maxItems, nums) + values = np.random.randint(1, avg*1.5, nums) sumRes = nums * avg @@ -172,15 +171,15 @@ def create(self) -> None: """ self._startTime = time.time() db = set() - lineSize = [] #may be error. need to check it. - sumRes = self.databaseSize * self.avgItemsPerTransaction # Total number of items + + values = self.generateArray(self.databaseSize, self.avgItemsPerTransaction, self.numItems) for i in range(self.databaseSize): # Determine the timestamp if self.performCoinFlip(self.occurrenceProbabilityOfSameTimestamp): timestamp = self.current_timestamp else: - if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp): + if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp)==1: self.current_timestamp += 2 else: self.current_timestamp += 1 @@ -188,23 +187,13 @@ def create(self) -> None: self.db.append([timestamp]) # Start the transaction with the timestamp - lineSize.append([i, 0]) # Initialize lineSize with 0 for each transaction - - # Adjust lineSize to ensure sum of sizes equals sumRes - lineSize = self.tuning(lineSize, sumRes) - # For each transaction, generate items - for i in tqdm.tqdm(range(len(lineSize))): - transaction_index = lineSize[i][0] - num_items = lineSize[i][1] - - if num_items > self.numItems: - raise ValueError( - "Error: Either increase numItems or decrease avgItemsPerTransaction or modify percentage") - items = np.random.choice(range(1, self.numItems + 1), num_items, replace=False) - self.db[transaction_index].extend(items) + for i in tqdm.tqdm(range(self.databaseSize)): + items = np.random.choice(range(1, self.numItems + 1), values[i], replace=False) + nline = [self.itemPoint[i] for i in items] + self.db[i].extend(nline) - self._runTime = time.time() - self._startTime + self._endTime = time.time() process = psutil.Process(os.getpid()) self._memoryUSS = process.memory_full_info().uss self._memoryRSS = process.memory_info().rss diff --git a/PAMI/extras/syntheticDataGenerator/utilityDatabase.py b/PAMI/extras/syntheticDataGenerator/UtilityDatabase.py similarity index 60% rename from PAMI/extras/syntheticDataGenerator/utilityDatabase.py rename to PAMI/extras/syntheticDataGenerator/UtilityDatabase.py index ef88ea64..94888b5b 100644 --- a/PAMI/extras/syntheticDataGenerator/utilityDatabase.py +++ b/PAMI/extras/syntheticDataGenerator/UtilityDatabase.py @@ -1,43 +1,65 @@ import numpy as np import pandas as pd import random +import psutil, os, time -class UtilityDataGenerator: - def __init__(self, databaseSize, numberOfItems, averageLengthOfTransaction, - minimumInternalUtilityValue, maximumInternalUtilityValue, - minimumExternalUtilityValue, maximumExternalUtilityValue): +class UtilityDatabase: + def __init__(self, databaseSize, numItems, avgItemsPerTransaction, + minInternalUtilityValue, maxInternalUtilityValue, + minExternalUtilityValue, maxExternalUtilityValue): self.databaseSize = databaseSize - self.numberOfItems = numberOfItems - self.averageLengthOfTransaction = averageLengthOfTransaction - self.minInternalUtilityValue = minimumInternalUtilityValue - self.maxInternalUtilityValue = maximumInternalUtilityValue - self.minExternalUtilityValue = minimumExternalUtilityValue - self.maxExternalUtilityValue = maximumExternalUtilityValue + self.numItems = numItems + self.avgItemsPerTransaction = avgItemsPerTransaction + self.minInternalUtilityValue = minInternalUtilityValue + self.maxInternalUtilityValue = maxInternalUtilityValue + self.minExternalUtilityValue = minExternalUtilityValue + self.maxExternalUtilityValue = maxExternalUtilityValue self.entries = [] self.ExternalUtilityData = self.GenerateExternalUtilityData() + self._startTime = float() + self._endTime = float() + self._memoryUSS = float() + self._memoryRSS = float() def GenerateExternalUtilityData(self): - items = range(1, self.numberOfItems + 1) + items = range(1, self.numItems + 1) ExternalUtilityData = {f'item{item}': random.randint(100, 900) for item in items} return ExternalUtilityData - def Generate(self): + def create(self): + self._startTime = time.time() for entry_id in range(1, self.databaseSize + 1): - entry_length = np.random.randint(1, self.averageLengthOfTransaction * 2) + entry_length = np.random.randint(1, self.avgItemsPerTransaction * 2) entry = np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1, - size=self.numberOfItems) + size=self.numItems) entry_sum = entry.sum() self.entries.append((entry, entry_sum)) + self._endTime = time.time() - def Save(self, fileName): + def save(self, fileName): with open(fileName, 'w') as file: for idx, (entry, entry_sum) in enumerate(self.entries, start=1): entry_str = '\t'.join(map(str, entry)) file.write(f'{idx}\t{entry_str}\t{entry_sum}\n') + def getMemoryUSS(self) -> float: + + process = psutil.Process(os.getpid()) + self._memoryUSS = process.memory_full_info().uss + return self._memoryUSS + + def getMemoryRSS(self) -> float: + + process = psutil.Process(os.getpid()) + self._memoryRSS = process.memory_info().rss + return self._memoryRSS + + def getRuntime(self) -> float: + return self._endTime - self._startTime + def SaveItemsInternalUtilityValues(self, fileName): - items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems) + items = random.sample(range(1, self.numItems + 1), self.numItems) internal_utility_data = [np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1) for _ in items] data = {'Item': items, 'Internal Utility Value': internal_utility_data} @@ -45,7 +67,7 @@ def SaveItemsInternalUtilityValues(self, fileName): df.to_csv(fileName, sep='\t', index=False) def Saveitemsexternalutilityvalues(self, fileName): - items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems) + items = random.sample(range(1, self.numItems + 1), self.numItems) data = {'Item': [f'item{item}' for item in items], 'External Utility Value': list(self.ExternalUtilityData.values())} df = pd.DataFrame(data) @@ -59,7 +81,7 @@ def GetUtilityData(self): return df def GetInternalUtilityData(self): - items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems) + items = random.sample(range(1, self.numItems + 1), self.numItems) InternalUtilityData = [np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1) for _ in items] data = {'Item': items, 'Internal Utility Value': InternalUtilityData} @@ -67,14 +89,14 @@ def GetInternalUtilityData(self): return df def GetExternalUtilityData(self): - items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems) + items = random.sample(range(1, self.numItems + 1), self.numItems) data = {'Item': [f'item{item}' for item in items], 'External Utility Value': list(self.ExternalUtilityData.values())} df = pd.DataFrame(data) return df def GenerateAndPrintItemPairs(self): - items = random.sample(range(1, self.numberOfItems + 1), 2) + items = random.sample(range(1, self.numItems + 1), 2) item1_id = f'item{items[0]}' item2_id = f'item{items[1]}' item1_value = self.ExternalUtilityData[item1_id] @@ -87,12 +109,15 @@ def GenerateAndPrintItemPairs(self): if __name__ == "__main__": - data_generator = UtilityDataGenerator(100000, 2000, 10, 1, 100, 1, 10) - data_generator.Generate() - data_generator.Save("utility_data-6.csv") + data_generator = UtilityDatabase(100000, 2000, 10, 1, 100, 1, 10) + data_generator.create() + data_generator.save("utility_data-6.csv") data_generator.SaveItemsInternalUtilityValues("items_internal_utility.csv") data_generator.Saveitemsexternalutilityvalues("items_external_utility.csv") - utility_data = data_generator.GetUtilityData() + utilityDataFrame = data_generator.GetUtilityData() + print('Runtime: ' + str(data_generator.getRuntime())) + print('Memory (RSS): ' + str(data_generator.getMemoryRSS())) + print('Memory (USS): ' + str(data_generator.getMemoryUSS())) InternalUtilityData = data_generator.GetInternalUtilityData() ExternalUtilityData = data_generator.GetExternalUtilityData()