minor optimization #414

Merged: 3 commits, May 20, 2024
143 changes: 69 additions & 74 deletions PAMI/frequentPattern/basic/ECLAT.py
@@ -59,21 +59,21 @@ class ECLAT(_ab._frequentPatterns):
About this algorithm
====================

- :**Description**: *ECLAT is one of the fundamental algorithm to discover frequent patterns in a transactional database.*
+ :**Description**: ECLAT is one of the fundamental algorithm to discover frequent patterns in a transactional database.

:**Reference**: Mohammed Javeed Zaki: Scalable Algorithms for Association Mining. IEEE Trans. Knowl. Data Eng. 12(3):
- 372-390 (2000), https://ieeexplore.ieee.org/document/846291
+ 372-390 (2000), https://ieeexplore.ieee.org/document/846291

:**Parameters**: - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.*
- - **oFile** (*str*) -- *Name of the Output file to store the frequent patterns.*
- - **minSup** (*int or float or str*) -- The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count.
- - **sep** (*str*) -- This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.
+ - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.*
+ - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count. Otherwise, it will be treated as float.*
+ - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.*

:**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.*
- - **endTime** (*float*) -- *To record the end time of the mining process.*
+ - **endTime** (*float*) -- *To record the completion time of the mining process.*
- **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.*
- **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.*
- - **memoryRSS** *(float*) -- *To store the total amount of RSS memory consumed by the program.*
+ - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.*
- **Database** (*list*) -- *To store the transactions of a database in list.*

Execution methods
@@ -183,58 +183,6 @@ def _creatingItemSets(self) -> float:
print("File Not Found")
quit()

- def _getUniqueItemList(self) -> list:
- """
-
- Generating one frequent patterns
-
- :return: list of unique patterns
- :rtype: list
- """
- self._finalPatterns = {}
- candidate = {}
- uniqueItem = []
- for i in range(len(self._Database)):
- for j in range(len(self._Database[i])):
- if self._Database[i][j] not in candidate:
- candidate[self._Database[i][j]] = {i}
- else:
- candidate[self._Database[i][j]].add(i)
- for key, value in candidate.items():
- supp = len(value)
- if supp >= self._minSup:
- self._finalPatterns[key] = [value]
- uniqueItem.append(key)
- uniqueItem.sort()
- return uniqueItem
-
- def _generateFrequentPatterns(self, candidateFrequent: list) -> None:
- """
-
- It will generate the combinations of frequent items
-
- :param candidateFrequent :it represents the items with their respective transaction identifiers
- :type candidateFrequent: list
- :return: None
- """
- new_freqList = []
- for i in range(0, len(candidateFrequent)):
- item1 = candidateFrequent[i]
- i1_list = item1.split()
- for j in range(i + 1, len(candidateFrequent)):
- item2 = candidateFrequent[j]
- i2_list = item2.split()
- if i1_list[:-1] == i2_list[:-1]:
- interSet = self._finalPatterns[item1][0].intersection(self._finalPatterns[item2][0])
- if len(interSet) >= self._minSup:
- newKey = item1 + "\t" + i2_list[-1]
- self._finalPatterns[newKey] = [interSet]
- new_freqList.append(newKey)
- else: break
-
- if len(new_freqList) > 0:
- self._generateFrequentPatterns(new_freqList)
-
def _convert(self, value) -> float:
"""

@@ -264,6 +212,30 @@ def startMine(self) -> None:

self.mine()

+ def __recursive(self, items, cands):
+ """
+
+ This function generates new candidates by taking input as original candidates.
+
+ :param items: A dictionary containing items and their corresponding support values.
+ :type items: dict
+ :param cands: A list of candidate itemsets.
+ :type cands: list
+ :return: None
+ """
+
+ for i in range(len(cands)):
+ newCands = []
+ for j in range(i + 1, len(cands)):
+ intersection = items[cands[i]].intersection(items[cands[j]])
+ if len(intersection) >= self._minSup:
+ newCand = tuple(cands[i] + tuple([cands[j][-1]]))
+ newCands.append(newCand)
+ items[newCand] = intersection
+ self._finalPatterns[newCand] = len(intersection)
+ if len(newCands) > 1:
+ self.__recursive(items, newCands)
+
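Despite the added docstring, `items` here maps each candidate tuple to its tidset (the set of transaction indexes containing it), not to a support value; the support of a candidate is the size of the intersected tidset, and the recursion stops once an equivalence class has fewer than two members. A minimal standalone sketch of that join on a toy database (illustration only; the names below are local to the example, not part of the changed file):

database = [["a", "b", "c"], ["a", "b"], ["a", "c"], ["b", "c"]]
minSup = 2

# 1-item candidates as tuples mapped to their tidsets (transaction indexes).
items = {}
for tid, transaction in enumerate(database):
    for item in transaction:
        items.setdefault((item,), set()).add(tid)
items = {k: v for k, v in items.items() if len(v) >= minSup}
patterns = {k: len(v) for k, v in items.items()}

def recursive(items, cands):
    # Extend each candidate with the last item of every later sibling; the
    # tidset of the extension is the intersection of the two parent tidsets.
    for i in range(len(cands)):
        newCands = []
        for j in range(i + 1, len(cands)):
            tidset = items[cands[i]] & items[cands[j]]
            if len(tidset) >= minSup:
                newCand = cands[i] + (cands[j][-1],)
                newCands.append(newCand)
                items[newCand] = tidset
                patterns[newCand] = len(tidset)
        if len(newCands) > 1:   # a single survivor cannot be joined further
            recursive(items, newCands)

recursive(items, list(items.keys()))
print(patterns)   # {('a',): 3, ('b',): 3, ('c',): 3, ('a', 'b'): 2, ('a', 'c'): 2, ('b', 'c'): 2}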
def mine(self) -> None:
"""
Frequent pattern mining process will start from here
@@ -275,11 +247,29 @@ def mine(self) -> None:
if self._minSup is None:
raise Exception("Please enter the Minimum Support")
self._creatingItemSets()

self._minSup = self._convert(self._minSup)
- uniqueItemList = self._getUniqueItemList()
- self._generateFrequentPatterns(uniqueItemList)
- for x, y in self._finalPatterns.items():
- self._finalPatterns[x] = len(y[0])


+ items = {}
+ index = 0
+ for line in self._Database:
+ for item in line:
+ if item not in items:
+ items[item] = []
+ items[item].append(index)
+ index += 1
+
+ items = {tuple([k]): set(v) for k, v in items.items() if len(v) >= self._minSup}
+ items = {k: v for k, v in sorted(items.items(), key=lambda item: len(item[1]), reverse=False)}
+ for k, v in items.items():
+ self._finalPatterns[k] = len(v)
+
+ cands = list(items.keys())
+
+ self.__recursive(items, cands)


self._endTime = _ab._time.time()
process = _ab._psutil.Process(_ab._os.getpid())
self._memoryUSS = float()
@@ -329,11 +319,18 @@ def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame:
:rtype: pd.DataFrame
"""

- dataFrame = {}
- data = []
- for a, b in self._finalPatterns.items():
- data.append([a.replace('\t', ' '), b])
- dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support'])
+ # time = _ab._time.time()
+ # dataFrame = {}
+ # data = []
+ # for a, b in self._finalPatterns.items():
+ # # data.append([a.replace('\t', ' '), b])
+ # data.append([" ".join(a), b])
+ # dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support'])
+ # print("Time taken to convert the frequent patterns into DataFrame is: ", _ab._time.time() - time)
+
+
+ dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support'])

return dataFrame

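A side effect of the switch to tuple keys: the Patterns column of the returned DataFrame now holds tuples rather than the old tab or space separated strings. A rough sketch of what a caller sees (toy values, not output from this PR):

import pandas as pd

finalPatterns = {('a',): 3, ('a', 'b'): 2}   # hypothetical mined result
df = pd.DataFrame(list(finalPatterns.items()), columns=['Patterns', 'Support'])
print(df)
# roughly:
#      Patterns  Support
# 0      ('a',)        3
# 1  ('a', 'b')        2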
def save(self, outFile: str) -> None:
@@ -345,15 +342,13 @@ def save(self, outFile: str) -> None:
:type outFile: csvfile
:return: None
"""
- self._oFile = outFile
- writer = open(self._oFile, 'w+')
- for x, y in self._finalPatterns.items():
- patternsAndSupport = x.strip() + ":" + str(y)
- writer.write("%s \n" % patternsAndSupport)
+ with open(outFile, 'w') as f:
+ for x, y in self._finalPatterns.items():
+ x = self._sep.join(x)
+ f.write(f"{x}:{y}\n")

def getPatterns(self) -> dict:
"""

Function to send the set of frequent patterns after completion of the mining process

:return: returning frequent patterns
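For context, a minimal usage sketch of the revised ECLAT class following PAMI's usual calling convention; the constructor signature, import path, and input file name below are assumptions, since they do not appear in this diff:

from PAMI.frequentPattern.basic import ECLAT as alg

# iFile and minSup are placeholders; per the docstring, minSup may also be
# given as a proportion (float) or as a string.
obj = alg.ECLAT(iFile='transactions.tsv', minSup=100, sep='\t')
obj.mine()                            # startMine() is deprecated in favour of mine()
frequentPatterns = obj.getPatterns()  # dict: tuple itemset -> support count
obj.save('frequentPatterns.txt')      # writes "item<sep>item...:support" lines
df = obj.getPatternsAsDataFrame()     # DataFrame with Patterns and Support columns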
116 changes: 52 additions & 64 deletions PAMI/frequentPattern/basic/ECLATDiffset.py
@@ -203,72 +203,38 @@ def _convert(self, value):
value = int(value)
return value

- def _getUniqueItemList(self):
-
- # tidSets will store all the initial tids
- tidSets = {}
- # uniqueItem will store all frequent 1 items
- uniqueItem = []
- for line in self._Database:
- transNum = 0
- # Database = [set([i.rstrip() for i in transaction.split('\t')]) for transaction in f]
- for transaction in self._Database:
- transNum += 1
- self._trans_set.add(transNum)
- for item in transaction:
- if item in tidSets:
- tidSets[item].add(transNum)
- else:
- tidSets[item] = {transNum}
- for key, value in tidSets.items():
- supp = len(value)
- if supp >= self._minSup:
- self._diffSets[key] = [supp, self._trans_set.difference(value)]
- uniqueItem.append(key)
- # for x, y in self._diffSets.items():
- # print(x, y)
- uniqueItem.sort()
- # print()
- return uniqueItem
-
- def _runDeclat(self, candidateList):
- """
-
- It will generate the combinations of frequent items
-
- :param candidateList :it represents the items with their respective transaction identifiers
- :type candidateList: list
- :return: returning transaction dictionary
- :rtype: dict
- """
-
- newList = []
- for i in range(0, len(candidateList)):
- item1 = candidateList[i]
- iList = item1.split()
- for j in range(i + 1, len(candidateList)):
- item2 = candidateList[j]
- jList = item2.split()
- if iList[:-1] == jList[:-1]:
- unionDiffSet = self._diffSets[item2][1].difference(self._diffSets[item1][1])
- unionSup = self._diffSets[item1][0] - len(unionDiffSet)
- if unionSup >= self._minSup:
- newKey = item1 + "\t" + jList[-1]
- self._diffSets[newKey] = [unionSup, unionDiffSet]
- newList.append(newKey)
- else:
- break
-
- if len(newList) > 0:
- self._runDeclat(newList)
-
@deprecated("It is recommended to use 'mine()' instead of 'startMine()' for mining process. Starting from January 2025, 'startMine()' will be completely terminated.")
def startMine(self):
"""
Frequent pattern mining process will start from here
"""
self.mine()

+ def __recursive(self, items, cands):
+ """
+
+ This function generates new candidates by taking input as original candidates.
+
+ :param items: A dictionary containing items and their corresponding support values.
+ :type items: dict
+ :param cands: A list of candidate itemsets.
+ :type cands: list
+ :return: None
+ """
+
+ for i in range(len(cands)):
+ newCands = []
+ for j in range(i + 1, len(cands)):
+ intersection = items[cands[i]] | items[cands[j]]
+ supp = len(self._db - intersection)
+ if supp >= self._minSup:
+ newCand = tuple(cands[i] + tuple([cands[j][-1]]))
+ newCands.append(newCand)
+ items[newCand] = intersection
+ self._finalPatterns[newCand] = supp
+ if len(newCands) > 1:
+ self.__recursive(items, newCands)
+
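In this diffset variant, `items` stores for each candidate the set of transaction ids that do not contain it (its complement with respect to `self._db`, built in mine() below), so the support of a join is the database size minus the size of the union of the two complements. A small standalone check of that identity (illustration only, not part of the changed file):

# Toy check: tidset(XY) = db - (miss(X) ∪ miss(Y)), hence
# support(XY) = |db| - |miss(X) ∪ miss(Y)|.
db = {0, 1, 2, 3, 4}                              # all transaction ids
tidsets = {'a': {0, 1, 2, 3}, 'b': {0, 1, 3}}     # toy tidsets
miss = {k: db - v for k, v in tidsets.items()}    # complements kept in `items`

suppAB = len(db - (miss['a'] | miss['b']))
assert suppAB == len(tidsets['a'] & tidsets['b']) == 3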
def mine(self):
"""
Frequent pattern mining process will start from here
@@ -286,11 +252,33 @@ def mine(self):
self._creatingItemSets()
#print(len(self._Database))
self._minSup = self._convert(self._minSup)
- uniqueItemList = []
- uniqueItemList = self._getUniqueItemList()
- self._runDeclat(uniqueItemList)
- self._finalPatterns = self._diffSets
- #print(len(self._finalPatterns), len(uniqueItemList))

+ items = {}
+ db = set([i for i in range(len(self._Database))])
+ for i in range(len(self._Database)):
+ for item in self._Database[i]:
+ if tuple([item]) in items:
+ items[tuple([item])].append(i)
+ else:
+ items[tuple([item])] = [i]
+
+ items = dict(sorted(items.items(), key=lambda x: len(x[1]), reverse=True))
+
+ keys = []
+ for item in list(items.keys()):
+ if len(items[item]) < self._minSup:
+ del items[item]
+ continue
+ self._finalPatterns[item] = len(items[item])
+ # print(item, len(items[item]))
+ items[item] = db - set(items[item])
+ # print(item, len(items[item]))
+ keys.append(item)
+
+ self._db = db
+
+ self.__recursive(items, keys)

self._endTime = _ab._time.time()
process = _ab._psutil.Process(_ab._os.getpid())
self._memoryUSS = float()
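For reference, the frequent-1-itemset setup in the new mine() body above reduces to the following steps on a toy database (sketch only; the descending-support sort is omitted and all names are local to the example):

database = [["a", "b"], ["a", "c"], ["a", "b", "c"], ["b"]]
minSup = 2
db = set(range(len(database)))            # {0, 1, 2, 3}

# Collect tid lists per single-item tuple key.
items = {}
for tid, transaction in enumerate(database):
    for item in transaction:
        items.setdefault((item,), []).append(tid)

# Prune infrequent items, record supports, and keep complements for the
# diffset-style support computation used by __recursive.
finalPatterns, keys = {}, []
for key in list(items):
    if len(items[key]) < minSup:
        del items[key]
        continue
    finalPatterns[key] = len(items[key])
    items[key] = db - set(items[key])
    keys.append(key)

print(finalPatterns)   # {('a',): 3, ('b',): 3, ('c',): 2}
print(items)           # {('a',): {3}, ('b',): {1}, ('c',): {0, 3}}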