Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#2 updated the UtilityDatabase.py #561

Merged
merged 3 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# generateTransactionalDatabase is code used to convert the database into a temporal database.
#
# **Importing this algorithm into a python program**
# --------------------------------------------------------
#
# from PAMI.extras.generateDatabase import generateTransactionalDatabase as db
# obj = db(10, 5, 10)
# obj.create()
# obj.save('db.txt')
# print(obj.getTransactions())  # prints the transactional database as a pandas DataFrame

# **Running the code from the command line**
# --------------------------------------------------------
#
# python generateDatabase.py 10 5 10 db.txt
# cat db.txt
#
Expand Down Expand Up @@ -121,16 +121,15 @@ def tuning(self, array, sumRes) -> list:
"""

while np.sum(array) != sumRes:
# get index of largest value
randIndex = np.random.randint(0, len(array))
# if sum is too large, decrease the largest value

if np.sum(array) > sumRes:
array[randIndex] -= 1
# if sum is too small, increase the smallest value
maxIndex = np.argmax(array)
array[maxIndex] -= 1
# if sum is too small, increase the smallest value
else:
minIndex = np.argmin(array)
array[randIndex] += 1
return array
array[minIndex] += 1
return array

def generateArray(self, nums, avg, maxItems) -> list:
"""
Expand All @@ -154,7 +153,7 @@ def generateArray(self, nums, avg, maxItems) -> list:
"""

# generate n random values
values = np.random.randint(1, maxItems, nums)
values = np.random.randint(1, avg * 1.5, nums)

sumRes = nums * avg

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
self.seperator = sep
self.occurrenceProbabilityOfSameTimestamp = occurrenceProbabilityOfSameTimestamp
self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp
self.current_timestamp = int()
self._startTime = float()
self._endTime = float()
self._memoryUSS = float()
Expand All @@ -76,7 +77,7 @@ def __init__(

def getPoint(self, x1, y1, x2, y2):

return (np.random.randint(x1, x2), np.random.randint(y1, y2))
return (np.random.randint(x1, x2),np.random.randint(y1, y2))

def performCoinFlip(self, probability: float) -> bool:
"""
Expand All @@ -86,7 +87,7 @@ def performCoinFlip(self, probability: float) -> bool:
:return: True if the coin lands heads, False otherwise.
"""
result = np.random.choice([0, 1], p=[1 - probability, probability])
return result == 1
return result

def tuning(self, array, sumRes) -> list:
"""
Expand All @@ -106,16 +107,14 @@ def tuning(self, array, sumRes) -> list:
"""

while np.sum(array) != sumRes:
# get index of largest value
randIndex = np.random.randint(0, len(array))
# if sum is too large, decrease the largest value
if np.sum(array) > sumRes:
array[randIndex] -= 1
# if sum is too small, increase the smallest value
maxIndex = np.argmax(array)
array[maxIndex] -= 1
# if sum is too small, increase the smallest value
else:
minIndex = np.argmin(array)
array[randIndex] += 1
return array
array[minIndex] += 1
return array

def generateArray(self, nums, avg, maxItems) -> list:
"""
Expand All @@ -139,7 +138,7 @@ def generateArray(self, nums, avg, maxItems) -> list:
"""

# generate n random values
values = np.random.randint(1, maxItems, nums)
values = np.random.randint(1, avg*1.5, nums)

sumRes = nums * avg

Expand Down Expand Up @@ -172,39 +171,29 @@ def create(self) -> None:
"""
self._startTime = time.time()
db = set()
lineSize = [] #may be error. need to check it.
sumRes = self.databaseSize * self.avgItemsPerTransaction # Total number of items

values = self.generateArray(self.databaseSize, self.avgItemsPerTransaction, self.numItems)

for i in range(self.databaseSize):
# Determine the timestamp
if self.performCoinFlip(self.occurrenceProbabilityOfSameTimestamp):
timestamp = self.current_timestamp
else:
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp):
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp)==1:
self.current_timestamp += 2
else:
self.current_timestamp += 1
timestamp = self.current_timestamp

self.db.append([timestamp]) # Start the transaction with the timestamp

lineSize.append([i, 0]) # Initialize lineSize with 0 for each transaction

# Adjust lineSize to ensure sum of sizes equals sumRes
lineSize = self.tuning(lineSize, sumRes)

# For each transaction, generate items
for i in tqdm.tqdm(range(len(lineSize))):
transaction_index = lineSize[i][0]
num_items = lineSize[i][1]

if num_items > self.numItems:
raise ValueError(
"Error: Either increase numItems or decrease avgItemsPerTransaction or modify percentage")
items = np.random.choice(range(1, self.numItems + 1), num_items, replace=False)
self.db[transaction_index].extend(items)
for i in tqdm.tqdm(range(self.databaseSize)):
items = np.random.choice(range(1, self.numItems + 1), values[i], replace=False)
nline = [self.itemPoint[i] for i in items]
self.db[i].extend(nline)

self._runTime = time.time() - self._startTime
self._endTime = time.time()
process = psutil.Process(os.getpid())
self._memoryUSS = process.memory_full_info().uss
self._memoryRSS = process.memory_info().rss
Expand Down
Original file line number Diff line number Diff line change
@@ -1,51 +1,73 @@
import numpy as np
import pandas as pd
import random
import psutil, os, time


class UtilityDataGenerator:
def __init__(self, databaseSize, numberOfItems, averageLengthOfTransaction,
minimumInternalUtilityValue, maximumInternalUtilityValue,
minimumExternalUtilityValue, maximumExternalUtilityValue):
class UtilityDatabase:
def __init__(self, databaseSize, numItems, avgItemsPerTransaction,
minInternalUtilityValue, maxInternalUtilityValue,
minExternalUtilityValue, maxExternalUtilityValue):
self.databaseSize = databaseSize
self.numberOfItems = numberOfItems
self.averageLengthOfTransaction = averageLengthOfTransaction
self.minInternalUtilityValue = minimumInternalUtilityValue
self.maxInternalUtilityValue = maximumInternalUtilityValue
self.minExternalUtilityValue = minimumExternalUtilityValue
self.maxExternalUtilityValue = maximumExternalUtilityValue
self.numItems = numItems
self.avgItemsPerTransaction = avgItemsPerTransaction
self.minInternalUtilityValue = minInternalUtilityValue
self.maxInternalUtilityValue = maxInternalUtilityValue
self.minExternalUtilityValue = minExternalUtilityValue
self.maxExternalUtilityValue = maxExternalUtilityValue
self.entries = []
self.ExternalUtilityData = self.GenerateExternalUtilityData()
self._startTime = float()
self._endTime = float()
self._memoryUSS = float()
self._memoryRSS = float()

def GenerateExternalUtilityData(self):
items = range(1, self.numberOfItems + 1)
items = range(1, self.numItems + 1)
ExternalUtilityData = {f'item{item}': random.randint(100, 900) for item in items}
return ExternalUtilityData

def Generate(self):
def create(self):
self._startTime = time.time()
for entry_id in range(1, self.databaseSize + 1):
entry_length = np.random.randint(1, self.averageLengthOfTransaction * 2)
entry_length = np.random.randint(1, self.avgItemsPerTransaction * 2)
entry = np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1,
size=self.numberOfItems)
size=self.numItems)
entry_sum = entry.sum()
self.entries.append((entry, entry_sum))
self._endTime = time.time()

def Save(self, fileName):
def save(self, fileName):
with open(fileName, 'w') as file:
for idx, (entry, entry_sum) in enumerate(self.entries, start=1):
entry_str = '\t'.join(map(str, entry))
file.write(f'{idx}\t{entry_str}\t{entry_sum}\n')

def getMemoryUSS(self) -> float:
    """
    Measure the USS memory of the running process and remember it.

    The value is read from psutil for the current process id, stored on
    ``self._memoryUSS`` and then returned.

    :return: the ``uss`` field reported by ``memory_full_info()``.
    """
    # Query the current interpreter process on every call so the figure is fresh.
    self._memoryUSS = psutil.Process(os.getpid()).memory_full_info().uss
    return self._memoryUSS

def getMemoryRSS(self) -> float:
    """
    Measure the RSS memory of the running process and remember it.

    The value is read from psutil for the current process id, stored on
    ``self._memoryRSS`` and then returned.

    :return: the ``rss`` field reported by ``memory_info()``.
    """
    # Query the current interpreter process on every call so the figure is fresh.
    self._memoryRSS = psutil.Process(os.getpid()).memory_info().rss
    return self._memoryRSS

def getRuntime(self) -> float:
    """
    Report how long the last generation run took.

    :return: ``self._endTime`` minus ``self._startTime`` (both are recorded
             with ``time.time()`` by ``create``).
    """
    elapsed = self._endTime - self._startTime
    return elapsed

def SaveItemsInternalUtilityValues(self, fileName):
items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems)
items = random.sample(range(1, self.numItems + 1), self.numItems)
internal_utility_data = [np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1) for _
in items]
data = {'Item': items, 'Internal Utility Value': internal_utility_data}
df = pd.DataFrame(data)
df.to_csv(fileName, sep='\t', index=False)

def Saveitemsexternalutilityvalues(self, fileName):
items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems)
items = random.sample(range(1, self.numItems + 1), self.numItems)
data = {'Item': [f'item{item}' for item in items],
'External Utility Value': list(self.ExternalUtilityData.values())}
df = pd.DataFrame(data)
Expand All @@ -59,22 +81,22 @@ def GetUtilityData(self):
return df

def GetInternalUtilityData(self):
items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems)
items = random.sample(range(1, self.numItems + 1), self.numItems)
InternalUtilityData = [np.random.randint(self.minInternalUtilityValue, self.maxInternalUtilityValue + 1) for _
in items]
data = {'Item': items, 'Internal Utility Value': InternalUtilityData}
df = pd.DataFrame(data)
return df

def GetExternalUtilityData(self):
items = random.sample(range(1, self.numberOfItems + 1), self.numberOfItems)
items = random.sample(range(1, self.numItems + 1), self.numItems)
data = {'Item': [f'item{item}' for item in items],
'External Utility Value': list(self.ExternalUtilityData.values())}
df = pd.DataFrame(data)
return df

def GenerateAndPrintItemPairs(self):
items = random.sample(range(1, self.numberOfItems + 1), 2)
items = random.sample(range(1, self.numItems + 1), 2)
item1_id = f'item{items[0]}'
item2_id = f'item{items[1]}'
item1_value = self.ExternalUtilityData[item1_id]
Expand All @@ -87,12 +109,15 @@ def GenerateAndPrintItemPairs(self):


if __name__ == "__main__":
data_generator = UtilityDataGenerator(100000, 2000, 10, 1, 100, 1, 10)
data_generator.Generate()
data_generator.Save("utility_data-6.csv")
data_generator = UtilityDatabase(100000, 2000, 10, 1, 100, 1, 10)
data_generator.create()
data_generator.save("utility_data-6.csv")
data_generator.SaveItemsInternalUtilityValues("items_internal_utility.csv")
data_generator.Saveitemsexternalutilityvalues("items_external_utility.csv")
utility_data = data_generator.GetUtilityData()
utilityDataFrame = data_generator.GetUtilityData()
print('Runtime: ' + str(data_generator.getRuntime()))
print('Memory (RSS): ' + str(data_generator.getMemoryRSS()))
print('Memory (USS): ' + str(data_generator.getMemoryUSS()))
InternalUtilityData = data_generator.GetInternalUtilityData()
ExternalUtilityData = data_generator.GetExternalUtilityData()

Expand Down