From ebe7a8d301be1336892678696dfcff5f1e2dba2f Mon Sep 17 00:00:00 2001 From: Tarun Sreepada Date: Tue, 30 Apr 2024 17:48:37 +0900 Subject: [PATCH] Efficent Pattern Save / Dataframe export --- PAMI/frequentPattern/basic/Apriori.py | 41 ++++++++++------- PAMI/frequentPattern/basic/Aprioribitset.py | 49 ++++++++++++++------- PAMI/frequentPattern/basic/ECLAT.py | 2 + PAMI/frequentPattern/basic/ECLATbitset.py | 5 +-- PAMI/frequentPattern/basic/FPGrowth.py | 41 +++++++++-------- 5 files changed, 83 insertions(+), 55 deletions(-) diff --git a/PAMI/frequentPattern/basic/Apriori.py b/PAMI/frequentPattern/basic/Apriori.py index fea3fa42..631643a3 100644 --- a/PAMI/frequentPattern/basic/Apriori.py +++ b/PAMI/frequentPattern/basic/Apriori.py @@ -244,7 +244,8 @@ def mine(self) -> None: for key in items: if len(items[key]) >= self._minSup: cands.append(key) - self._finalPatterns["\t".join(key)] = len(items[key]) + # self._finalPatterns["\t".join(key)] = len(items[key]) + self._finalPatterns[key] = len(items[key]) fileData[key] = set(items[key]) else: break @@ -260,7 +261,7 @@ def mine(self) -> None: intersection = intersection.intersection(fileData[tuple([newCand[k]])]) if len(intersection) >= self._minSup: newKeys.append(newCand) - newCand = "\t".join(newCand) + # newCand = "\t".join(newCand) self._finalPatterns[newCand] = len(intersection) del cands cands = newKeys @@ -313,31 +314,41 @@ def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame: Storing final frequent patterns in a dataframe :return: returning frequent patterns in a dataframe + :rtype: pd.DataFrame + """ - dataFrame = {} - data = [] - for a, b in self._finalPatterns.items(): - data.append([a.replace('\t', ' '), b]) - dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) - # dataFrame = dataFrame.replace(r'\r+|\n+|\t+',' ', regex=True) + # time = _ab._time.time() + # dataFrame = {} + # data = [] + # for a, b in self._finalPatterns.items(): + # # data.append([a.replace('\t', ' '), b]) + # data.append([" ".join(a), b]) + # dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) + # print("Time taken to convert the frequent patterns into DataFrame is: ", _ab._time.time() - time) + + + dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support']) + return dataFrame - def save(self, outFile) -> None: + def save(self, outFile: str) -> None: """ - This function writes the final patterns into csv file. + Complete set of frequent patterns will be loaded in to an output file :param outFile: name of the output file + :type outFile: csvfile + :return: None + """ - self._oFile = outFile - writer = open(self._oFile, 'w+') - for x, y in self._finalPatterns.items(): - s1 = x.strip() + ":" + str(y) - writer.write("%s \n" % s1) + with open(outFile, 'w') as f: + for x, y in self._finalPatterns.items(): + x = self._sep.join(x) + f.write(f"{x} : {y}\n") def getPatterns(self) -> Dict[str, int]: """ diff --git a/PAMI/frequentPattern/basic/Aprioribitset.py b/PAMI/frequentPattern/basic/Aprioribitset.py index 9b36b39e..2304f71f 100644 --- a/PAMI/frequentPattern/basic/Aprioribitset.py +++ b/PAMI/frequentPattern/basic/Aprioribitset.py @@ -268,7 +268,7 @@ def mine(self) -> None: cands = [] for key in items: if len(items[key]) >= self._minSup: - self._finalPatterns["\t".join(key)] = len(items[key]) + self._finalPatterns[key] = len(items[key]) cands.append(key) items[key] = self._bitPacker(items[key], index) # print(key, items[key]) @@ -287,7 +287,6 @@ def mine(self) -> None: count = int.bit_count(intersection) if count >= self._minSup: newCands.append(newCand) - newCand = "\t".join(newCand) self._finalPatterns[newCand] = count else: break @@ -329,31 +328,47 @@ def getRuntime(self): return self._endTime - self._startTime - def getPatternsAsDataFrame(self): + def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame: """ + Storing final frequent patterns in a dataframe + :return: returning frequent patterns in a dataframe + :rtype: pd.DataFrame + """ - dataFrame = {} - data = [] - for a, b in self._finalPatterns.items(): - data.append([a.replace('\t', ' '), b]) - dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) + # time = _ab._time.time() + # dataFrame = {} + # data = [] + # for a, b in self._finalPatterns.items(): + # # data.append([a.replace('\t', ' '), b]) + # data.append([" ".join(a), b]) + # dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) + # print("Time taken to convert the frequent patterns into DataFrame is: ", _ab._time.time() - time) + + + dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support']) + return dataFrame - def save(self, outFile): + def save(self, outFile: str) -> None: """ + Complete set of frequent patterns will be loaded in to an output file - :param outFile: name of the outputfile - :type outFile: file + + :param outFile: name of the output file + + :type outFile: csvfile + + :return: None + """ - self._oFile = outFile - writer = open(self._oFile, 'w+') - for x, y in self._finalPatterns.items(): - patternsAndSupport = x.strip() + ":" + str(y) - writer.write("%s \n" % patternsAndSupport) + with open(outFile, 'w') as f: + for x, y in self._finalPatterns.items(): + x = self._sep.join(x) + f.write(f"{x} : {y}\n") def getPatterns(self): """ @@ -389,3 +404,5 @@ def printResults(self): print("Total ExecutionTime in ms:", _ap.getRuntime()) else: print("Error! The number of input parameters do not match the total number of parameters provided") + + diff --git a/PAMI/frequentPattern/basic/ECLAT.py b/PAMI/frequentPattern/basic/ECLAT.py index 50259765..fd1efaa8 100644 --- a/PAMI/frequentPattern/basic/ECLAT.py +++ b/PAMI/frequentPattern/basic/ECLAT.py @@ -398,3 +398,5 @@ def printResults(self) -> None: else: print("Error! The number of input parameters do not match the total number of parameters provided") + + diff --git a/PAMI/frequentPattern/basic/ECLATbitset.py b/PAMI/frequentPattern/basic/ECLATbitset.py index c208cd00..cddc968a 100644 --- a/PAMI/frequentPattern/basic/ECLATbitset.py +++ b/PAMI/frequentPattern/basic/ECLATbitset.py @@ -289,9 +289,6 @@ def mine(self) -> None: self.__recursive(items, cands) - - - self._endTime = _ab._time.time() process = _ab._psutil.Process(_ab._os.getpid()) self._memoryUSS = float() @@ -404,3 +401,5 @@ def printResults(self): print("Total ExecutionTime in ms:", _ap.getRuntime()) else: print("Error! The number of input parameters do not match the total number of parameters provided") + + diff --git a/PAMI/frequentPattern/basic/FPGrowth.py b/PAMI/frequentPattern/basic/FPGrowth.py index 7e9075ce..41110765 100644 --- a/PAMI/frequentPattern/basic/FPGrowth.py +++ b/PAMI/frequentPattern/basic/FPGrowth.py @@ -379,8 +379,9 @@ def _recursive(self, root, itemNode, minSup, patterns): break newRoot = _Node(root.item + [item], 0, None) - pat = "\t".join([str(i) for i in newRoot.item]) - self.__finalPatterns[pat] = itemNode[item][1] + # pat = "\t".join([str(i) for i in newRoot.item]) + # self.__finalPatterns[pat] = itemNode[item][1] + self._finalPatterns[tuple(newRoot.item)] = itemNode[item][1] newItemNode = {} if len(itemNode[item][0]) == 1: @@ -389,10 +390,10 @@ def _recursive(self, root, itemNode, minSup, patterns): continue combination = self._all_combinations(transaction) for comb in combination: - pat = "\t".join([str(i) for i in comb]) - pat = pat + "\t" + "\t".join([str(i) for i in newRoot.item]) - self.__finalPatterns[pat] = count - # self._finalPatterns[tuple(list(comb) + newRoot.item)] = count + # pat = "\t".join([str(i) for i in comb]) + # pat = pat + "\t" + "\t".join([str(i) for i in newRoot.item]) + # self.__finalPatterns[pat] = count + self._finalPatterns[tuple(list(comb) + newRoot.item)] = count pass @@ -504,6 +505,7 @@ def getRuntime(self) -> float: """ return self.__endTime - self.__startTime + def getPatternsAsDataFrame(self) -> _fp._pd.DataFrame: """ @@ -514,12 +516,13 @@ def getPatternsAsDataFrame(self) -> _fp._pd.DataFrame: :rtype: pd.DataFrame """ - dataframe = {} - data = [] - for a, b in self.__finalPatterns.items(): - data.append([a.replace('\t', ' '), b]) - dataframe = _fp._pd.DataFrame(data, columns=['Patterns', 'Support']) - return dataframe + # dataframe = {} + # data = [] + # for a, b in self.__finalPatterns.items(): + # data.append([a.replace('\t', ' '), b]) + # dataframe = _fp._pd.DataFrame(data, columns=['Patterns', 'Support']) + dataFrame = _fp._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support']) + return dataFrame def save(self, outFile: str) -> None: """ @@ -531,11 +534,10 @@ def save(self, outFile: str) -> None: :return: None """ - self._oFile = outFile - writer = open(self._oFile, 'w+') - for x, y in self.__finalPatterns.items(): - s1 = x.strip() + ":" + str(y) - writer.write("%s \n" % s1) + with open(outFile, 'w') as f: + for x, y in self._finalPatterns.items(): + x = self._sep.join(x) + f.write(f"{x} : {y}\n") def getPatterns(self) -> Dict[str, int]: """ @@ -543,7 +545,7 @@ def getPatterns(self) -> Dict[str, int]: :return: returning frequent patterns :rtype: dict """ - return self.__finalPatterns + return self._finalPatterns def printResults(self) -> None: """ @@ -571,6 +573,3 @@ def printResults(self) -> None: print("Total ExecutionTime in ms:", _ap.getRuntime()) else: print("Error! The number of input parameters do not match the total number of parameters provided") - - - \ No newline at end of file