diff --git a/PAMI/AssociationRules/basic/abstract.py b/PAMI/AssociationRules/basic/abstract.py
index 2c027bb2..62e4495c 100644
--- a/PAMI/AssociationRules/basic/abstract.py
+++ b/PAMI/AssociationRules/basic/abstract.py
@@ -110,7 +110,7 @@ def __init__(self, iFile, minConf, sep="\t"):
self._iFile = iFile
self._sep = sep
self._minConf = minConf
- self._finalPatterns = {}
+ self._associationRules = {}
self._oFile = str()
self._memoryUSS = float()
self._memoryRSS = float()
@@ -126,7 +126,7 @@ def startMine(self):
pass
@_abstractmethod
- def getPatterns(self):
+ def getAssociationRules(self):
"""
Complete set of frequent patterns generated will be retrieved from this function
"""
@@ -144,7 +144,7 @@ def save(self, oFile):
pass
@_abstractmethod
- def getPatternsAsDataFrame(self):
+ def getAssociationRulesAsDataFrame(self):
"""
Complete set of frequent patterns will be loaded in to data frame from this function
"""
diff --git a/PAMI/AssociationRules/basic/confidence.py b/PAMI/AssociationRules/basic/confidence.py
index f948c405..8c44587f 100644
--- a/PAMI/AssociationRules/basic/confidence.py
+++ b/PAMI/AssociationRules/basic/confidence.py
@@ -2,23 +2,23 @@
#
# **Importing this algorithm into a python program**
#
-# import PAMI.AssociationRules.basic import ARWithConfidence as alg
+# from PAMI.AssociationRules.basic import confidence as alg
#
# iFile = 'sampleDB.txt'
#
# minConf = 0.5
#
-# obj = alg.ARWithConfidence(iFile, minConf)
+# obj = alg.confidence(iFile, minConf)
#
# obj.mine()
#
-# associationRules = obj.getPatterns()
+# associationRules = obj.getAssociationRules()
#
# print("Total number of Association Rules:", len(associationRules))
#
# obj.save(oFile)
#
-# Df = obj.getPatternInDataFrame()
+# Df = obj.getAssociationRulesAsDataFrame()
#
# memUSS = obj.getMemoryUSS()
#
@@ -92,11 +92,11 @@ class confidence:
Format:
- (.venv) $ python3 ARWithConfidence.py
+ (.venv) $ python3 confidence.py
Example Usage:
- (.venv) $ python3 ARWithConfidence.py sampleDB.txt patterns.txt 0.5 ' '
+ (.venv) $ python3 confidence.py sampleDB.txt patterns.txt 0.5 ' '
.. note:: minConf can be specified in a value between 0 and 1.
@@ -105,23 +105,23 @@ class confidence:
.. code-block:: python
- import PAMI.AssociationRules.basic import ARWithConfidence as alg
+ from PAMI.AssociationRules.basic import confidence as alg
iFile = 'sampleDB.txt'
minConf = 0.5
- obj = alg.ARWithConfidence(iFile, minConf)
+ obj = alg.confidence(iFile, minConf)
obj.mine()
- associationRules = obj.getPatterns()
+ associationRules = obj.getAssociationRules()
print("Total number of Association Rules:", len(associationRules))
obj.save(oFile)
- Df = obj.getPatternInDataFrame()
+ Df = obj.getAssociationRulesAsDataFrame()
memUSS = obj.getMemoryUSS()
@@ -151,7 +151,7 @@ class confidence:
_Sep = " "
_memoryUSS = float()
_memoryRSS = float()
- _frequentPatterns = {}
+ _associationRules = {}
def __init__(self, iFile, minConf, sep):
"""
@@ -164,14 +164,14 @@ def __init__(self, iFile, minConf, sep):
"""
self._iFile = iFile
self._minConf = minConf
- self._finalPatterns = {}
+ self._associationRules = {}
self._sep = sep
def _readPatterns(self):
"""
Reading the input file and storing all the frequent patterns and their support respectively in a frequentPatterns variable.
""" - self._frequentPatterns = {} + self._associationRules = {} if isinstance(self._iFile, _ab._pd.DataFrame): pattern, support = [], [] if self._iFile.empty: @@ -193,7 +193,7 @@ def _readPatterns(self): If that doesn't work, please raise an issue in the github repository.\ Got pattern: ", pattern[i], "at index: ", i, "in the dataframe, type: ", type(pattern[i])) s = tuple(sorted(pattern[i])) - self._frequentPatterns[s] = support[i] + self._associationRules[s] = support[i] if isinstance(self._iFile, str): if _ab._validators.url(self._iFile): f = _ab._urlopen(self._iFile) @@ -202,7 +202,7 @@ def _readPatterns(self): line = line.split(':') s = line[0].split(self._sep) s = tuple(sorted(s)) - self._frequentPatterns[s] = int(line[1]) + self._associationRules[s] = int(line[1]) else: try: with open(self._iFile, 'r', encoding='utf-8') as f: @@ -212,7 +212,7 @@ def _readPatterns(self): s = line[0].split(self._sep) s = [x.strip() for x in s] s = tuple(sorted(s)) - self._frequentPatterns[s] = int(line[1]) + self._associationRules[s] = int(line[1]) except IOError: print("File Not Found") quit() @@ -235,17 +235,17 @@ def mine(self): self._startTime = _ab._time.time() self._readPatterns() - keys = list(self._frequentPatterns.keys()) + keys = list(self._associationRules.keys()) - for i in range(len(self._frequentPatterns)): - key = self._frequentPatterns[keys[i]] + for i in range(len(self._associationRules)): + key = self._associationRules[keys[i]] for idx in range(len(keys[i]) - 1, 0, -1): for c in combinations(keys[i], r=idx): antecedent = c # consequent = keys[i] - antecedent - conf = key / self._frequentPatterns[antecedent] + conf = key / self._associationRules[antecedent] if conf >= self._minConf: - self._finalPatterns[antecedent + tuple(['->']) + keys[i]] = conf + self._associationRules[antecedent + tuple(['->']) + keys[i]] = conf self._endTime = _ab._time.time() process = _ab._psutil.Process(_ab._os.getpid()) @@ -285,7 +285,7 @@ def getRuntime(self): return self._endTime - self._startTime - def getPatternsAsDataFrame(self): + def getAssociationRulesAsDataFrame(self): """ Storing final frequent patterns in a dataframe @@ -301,7 +301,7 @@ def getPatternsAsDataFrame(self): # # dataFrame = dataFrame.replace(r'\r+|\n+|\t+',' ', regex=True) # return dataFrame - dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support']) + dataFrame = _ab._pd.DataFrame(list(self._associationRules.items()), columns=['Patterns', 'Support']) return dataFrame def save(self, outFile: str) -> None: @@ -314,24 +314,24 @@ def save(self, outFile: str) -> None: :return: None """ with open(outFile, 'w') as f: - for x, y in self._finalPatterns.items(): + for x, y in self._associationRules.items(): x = self._sep.join(x) f.write(f"{x} : {y}\n") - def getPatterns(self): + def getAssociationRules(self): """ Function to send the set of frequent patterns after completion of the mining process :return: returning frequent patterns :rtype: dict """ - return self._finalPatterns + return self._associationRules def printResults(self): """ Function to send the result after completion of the mining process """ - print("Total number of Association Rules:", len(self.getPatterns())) + print("Total number of Association Rules:", len(self.getAssociationRules())) print("Total Memory in USS:", self.getMemoryUSS()) print("Total Memory in RSS", self.getMemoryRSS()) print("Total ExecutionTime in ms:", self.getRuntime()) @@ -346,7 +346,7 @@ def printResults(self): _ap = confidence(_ab._sys.argv[1], _ab._sys.argv[3]) 
_ap.startMine()
_ap.mine()
- print("Total number of Association Rules:", len(_ap.getPatterns()))
+ print("Total number of Association Rules:", len(_ap.getAssociationRules()))
_ap.save(_ab._sys.argv[2])
print("Total Memory in USS:", _ap.getMemoryUSS())
print("Total Memory in RSS", _ap.getMemoryRSS())
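The rule generation in confidence.mine() above enumerates, for every frequent itemset X, each proper non-empty subset A as a candidate antecedent and keeps the rule when sup(X) / sup(A) clears minConf; note that the stored key is antecedent + '->' + the full itemset rather than an antecedent/consequent split. A minimal, self-contained sketch of that loop, with a hypothetical frequentPatterns dict standing in for the output of _readPatterns():

.. code-block:: python

    # Hedged sketch of confidence-based rule generation; the toy
    # frequentPatterns dict and minConf value below are hypothetical.
    from itertools import combinations

    frequentPatterns = {('a',): 4, ('b',): 5, ('a', 'b'): 3}
    minConf = 0.6

    rules = {}
    for itemset, support in frequentPatterns.items():
        # every proper, non-empty subset of the itemset is a candidate antecedent
        for size in range(len(itemset) - 1, 0, -1):
            for antecedent in combinations(itemset, size):
                conf = support / frequentPatterns[antecedent]  # sup(X) / sup(A)
                if conf >= minConf:
                    rules[antecedent + ('->',) + itemset] = conf

    print(rules)  # {('a', '->', 'a', 'b'): 0.75, ('b', '->', 'a', 'b'): 0.6}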
diff --git a/PAMI/AssociationRules/basic/leverage.py b/PAMI/AssociationRules/basic/leverage.py
index 3407f2a2..dcd0956f 100644
--- a/PAMI/AssociationRules/basic/leverage.py
+++ b/PAMI/AssociationRules/basic/leverage.py
@@ -1,21 +1,20 @@
# This code uses "leverage" metric to extract the association rules from given frequent patterns.
#
# **Importing this algorithm into a python program**
-# ----------------------------------------------------
#
-# import PAMI.AssociationRules.basic import ARWithleverage as alg
+# from PAMI.AssociationRules.basic import leverage as alg
#
-# obj = alg.ARWithleverage(iFile, minLev)
+# obj = alg.leverage(iFile, minLev)
#
# obj.mine()
#
-# associationRules = obj.getPatterns()
+# associationRules = obj.getAssociationRules()
#
# print("Total number of Association Rules:", len(associationRules))
#
# obj.save(oFile)
#
-# Df = obj.getPatternInDataFrame()
+# Df = obj.getAssociationRulesAsDataFrame()
#
# memUSS = obj.getMemoryUSS()
#
@@ -89,11 +88,11 @@ class leverage:
Format:
- (.venv) $ python3 ARWithleverage.py
+ (.venv) $ python3 leverage.py
Example Usage:
- (.venv) $ python3 ARWithleverage.py sampleDB.txt patterns.txt 0.5 ' '
+ (.venv) $ python3 leverage.py sampleDB.txt patterns.txt 0.5 ' '
.. note:: minLev can be specified in a value between 0 and 1.
@@ -102,19 +101,19 @@ class leverage:
.. code-block:: python
- import PAMI.AssociationRules.basic import ARWithleverage as alg
+ from PAMI.AssociationRules.basic import leverage as alg
- obj = alg.ARWithleverage(iFile, minLev)
+ obj = alg.leverage(iFile, minLev)
obj.mine()
- associationRules = obj.getPatterns()
+ associationRules = obj.getAssociationRules()
print("Total number of Association Rules:", len(associationRules))
obj.save(oFile)
- Df = obj.getPatternInDataFrame()
+ Df = obj.getAssociationRulesAsDataFrame()
memUSS = obj.getMemoryUSS()
@@ -144,7 +143,7 @@ class leverage:
_Sep = " "
_memoryUSS = float()
_memoryRSS = float()
- _frequentPatterns = {}
+ _associationRules = {}
def __init__(self, iFile, minLev, sep, maxTS):
"""
@@ -157,7 +156,7 @@ def __init__(self, iFile, minLev, sep, maxTS):
"""
self._iFile = iFile
self._minLev = minLev
- self._finalPatterns = {}
+ self._associationRules = {}
self._sep = sep
self._maxTS = maxTS
@@ -165,7 +164,7 @@ def _readPatterns(self):
"""
Reading the input file and storing all the frequent patterns and their support respectively in a frequentPatterns variable.
"""
- self._frequentPatterns = {}
+ self._associationRules = {}
if isinstance(self._iFile, _ab._pd.DataFrame):
pattern, support = [], []
if self._iFile.empty:
@@ -184,7 +183,7 @@ def _readPatterns(self):
raise ValueError("Pattern should be a tuple. PAMI is going through a major revision. Please raise an issue in the github repository regarding this error and provide information regarding input and algorithm.\
In the meanwhile try saving the patterns to a file using (alg).save() and use the file as input. If that doesn't work, please raise an issue in the github repository.")
s = tuple(sorted(pattern[i]))
- self._frequentPatterns[s] = support[i] / self._maxTS
+ self._associationRules[s] = support[i] / self._maxTS
if isinstance(self._iFile, str):
if _ab._validators.url(self._iFile):
f = _ab._urlopen(self._iFile)
@@ -194,7 +193,7 @@ def _readPatterns(self):
s = line[0].split(self._sep)
s = tuple(sorted(s))
- self._frequentPatterns[s] = int(line[1]) / self._maxTS
+ self._associationRules[s] = int(line[1]) / self._maxTS
else:
try:
with open(self._iFile, 'r', encoding='utf-8') as f:
@@ -204,7 +203,7 @@ def _readPatterns(self):
s = line[0].split(self._sep)
s = [x.strip() for x in s]
s = tuple(sorted(s))
- self._frequentPatterns[s] = int(line[1]) / self._maxTS
+ self._associationRules[s] = int(line[1]) / self._maxTS
except IOError:
print("File Not Found")
quit()
@@ -227,19 +226,19 @@ def mine(self):
self._startTime = _ab._time.time()
self._readPatterns()
- keys = list(self._frequentPatterns.keys())
+ keys = list(self._associationRules.keys())
- for i in range(len(self._frequentPatterns)):
- key = self._frequentPatterns[keys[i]]
+ for i in range(len(self._associationRules)):
+ key = self._associationRules[keys[i]]
for idx in range(len(keys[i]) - 1, 0, -1):
for c in combinations(keys[i], r=idx):
antecedent = c
# consequent = keys[i] - antecedent
consequent = tuple(sorted([x for x in keys[i] if x not in antecedent]))
# Lev = key / self._frequentPatterns[antecedent]
- lev = key - self._frequentPatterns[antecedent] * self._frequentPatterns[consequent]
+ lev = key - self._associationRules[antecedent] * self._associationRules[consequent]
if lev >= self._minLev:
- self._finalPatterns[antecedent + tuple(['->']) + keys[i]] = lev
+ self._associationRules[antecedent + tuple(['->']) + keys[i]] = lev
self._endTime = _ab._time.time()
process = _ab._psutil.Process(_ab._os.getpid())
@@ -279,7 +278,7 @@ def getRuntime(self):
return self._endTime - self._startTime
- def getPatternsAsDataFrame(self):
+ def getAssociationRulesAsDataFrame(self):
"""
Storing final frequent patterns in a dataframe
@@ -295,7 +294,7 @@ def getPatternsAsDataFrame(self):
# # dataFrame = dataFrame.replace(r'\r+|\n+|\t+',' ', regex=True)
# return dataFrame
- dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support'])
+ dataFrame = _ab._pd.DataFrame(list(self._associationRules.items()), columns=['Patterns', 'Support'])
return dataFrame
def save(self, outFile: str) -> None:
@@ -308,24 +307,24 @@ def save(self, outFile: str) -> None:
:return: None
"""
with open(outFile, 'w') as f:
- for x, y in self._finalPatterns.items():
+ for x, y in self._associationRules.items():
x = self._sep.join(x)
f.write(f"{x} : {y}\n")
- def getPatterns(self):
+ def getAssociationRules(self):
"""
Function to send the set of frequent patterns after completion of the mining process
:return: returning frequent patterns
:rtype: dict
"""
- return self._finalPatterns
+ return self._associationRules
def printResults(self):
"""
Function to send the result after completion of the mining process
"""
- print("Total number of Association Rules:", len(self.getPatterns()))
+ print("Total number of Association Rules:", len(self.getAssociationRules()))
print("Total Memory in USS:", self.getMemoryUSS())
print("Total Memory in RSS", self.getMemoryRSS())
print("Total ExecutionTime in ms:", self.getRuntime())
@@ -340,7 +339,7 @@ def printResults(self):
_ap = leverage(_ab._sys.argv[1], _ab._sys.argv[3])
_ap.startMine()
_ap.mine()
- print("Total number of Association Rules:", len(_ap.getPatterns()))
+ print("Total number of Association Rules:", len(_ap.getAssociationRules()))
_ap.save(_ab._sys.argv[2])
print("Total Memory in USS:", _ap.getMemoryUSS())
print("Total Memory in RSS", _ap.getMemoryRSS())
diff --git a/PAMI/AssociationRules/basic/lift.py b/PAMI/AssociationRules/basic/lift.py
index 2566b59c..604fc5e6 100644
--- a/PAMI/AssociationRules/basic/lift.py
+++ b/PAMI/AssociationRules/basic/lift.py
@@ -1,21 +1,20 @@
# This code uses "lift" metric to extract the association rules from given frequent patterns.
#
# **Importing this algorithm into a python program**
-# ----------------------------------------------------
#
-# import PAMI.AssociationRules.basic import ARWithlift as alg
+# from PAMI.AssociationRules.basic import lift as alg
#
-# obj = alg.ARWithlift(iFile, minLift)
+# obj = alg.lift(iFile, minLift)
#
# obj.mine()
#
-# associationRules = obj.getPatterns()
+# associationRules = obj.getAssociationRules()
#
# print("Total number of Association Rules:", len(associationRules))
#
# obj.save(oFile)
#
-# Df = obj.getPatternInDataFrame()
+# Df = obj.getPatternsAsDataFrame()
#
# memUSS = obj.getMemoryUSS()
#
@@ -89,11 +88,11 @@ class lift:
Format:
- (.venv) $ python3 ARWithlift.py
+ (.venv) $ python3 lift.py
Example Usage:
- (.venv) $ python3 ARWithlift.py sampleDB.txt patterns.txt 0.5 ' '
+ (.venv) $ python3 lift.py sampleDB.txt patterns.txt 0.5 ' '
.. note:: minLift can be specified in a value between 0 and 1.
@@ -102,19 +101,19 @@ class lift:
.. code-block:: python
- import PAMI.AssociationRules.basic import ARWithlift as alg
+ from PAMI.AssociationRules.basic import lift as alg
- obj = alg.ARWithlift(iFile, minLift)
+ obj = alg.lift(iFile, minLift)
obj.mine()
- associationRules = obj.getPatterns()
+ associationRules = obj.getAssociationRules()
print("Total number of Association Rules:", len(associationRules))
obj.save(oFile)
- Df = obj.getPatternInDataFrame()
+ Df = obj.getPatternsAsDataFrame()
memUSS = obj.getMemoryUSS()
@@ -144,7 +143,7 @@ class lift:
_Sep = " "
_memoryUSS = float()
_memoryRSS = float()
- _frequentPatterns = {}
+ _associationRules = {}
def __init__(self, iFile, minLift, sep):
"""
@@ -157,14 +156,14 @@ def __init__(self, iFile, minLift, sep):
"""
self._iFile = iFile
self._minLift = minLift
- self._finalPatterns = {}
+ self._associationRules = {}
self._sep = sep
def _readPatterns(self):
"""
Reading the input file and storing all the frequent patterns and their support respectively in a frequentPatterns variable.
"""
- self._frequentPatterns = {}
+ self._associationRules = {}
if isinstance(self._iFile, _ab._pd.DataFrame):
pattern, support = [], []
if self._iFile.empty:
@@ -183,7 +182,7 @@ def _readPatterns(self):
raise ValueError("Pattern should be a tuple. PAMI is going through a major revision. Please raise an issue in the github repository regarding this error and provide information regarding input and algorithm.\
In the meanwhile try saving the patterns to a file using (alg).save() and use the file as input. If that doesn't work, please raise an issue in the github repository.")
s = tuple(sorted(pattern[i]))
- self._frequentPatterns[s] = support[i]
+ self._associationRules[s] = support[i]
if isinstance(self._iFile, str):
if _ab._validators.url(self._iFile):
f = _ab._urlopen(self._iFile)
@@ -193,7 +192,7 @@ def _readPatterns(self):
s = line[0].split(self._sep)
s = tuple(sorted(s))
- self._frequentPatterns[s] = int(line[1])
+ self._associationRules[s] = int(line[1])
else:
try:
with open(self._iFile, 'r', encoding='utf-8') as f:
@@ -203,7 +202,7 @@ def _readPatterns(self):
s = line[0].split(self._sep)
s = [x.strip() for x in s]
s = tuple(sorted(s))
- self._frequentPatterns[s] = int(line[1])
+ self._associationRules[s] = int(line[1])
except IOError:
print("File Not Found")
quit()
@@ -226,18 +225,18 @@ def mine(self):
self._startTime = _ab._time.time()
self._readPatterns()
- keys = list(self._frequentPatterns.keys())
+ keys = list(self._associationRules.keys())
- for i in range(len(self._frequentPatterns)):
- key = self._frequentPatterns[keys[i]]
+ for i in range(len(self._associationRules)):
+ key = self._associationRules[keys[i]]
for idx in range(len(keys[i]) - 1, 0, -1):
for c in combinations(keys[i], r=idx):
antecedent = c
consequent = tuple(sorted([x for x in keys[i] if x not in antecedent]))
# print(antecedent, consequent)
- lift = key / (self._frequentPatterns[antecedent]) * self._frequentPatterns[consequent]
+ lift = key / (self._associationRules[antecedent] * self._associationRules[consequent])
if lift >= self._minLift:
- self._finalPatterns[antecedent + tuple(['->']) + keys[i]] = lift
+ self._associationRules[antecedent + tuple(['->']) + keys[i]] = lift
self._endTime = _ab._time.time()
process = _ab._psutil.Process(_ab._os.getpid())
@@ -293,7 +292,7 @@ def getPatternsAsDataFrame(self):
# # dataFrame = dataFrame.replace(r'\r+|\n+|\t+',' ', regex=True)
# return dataFrame
- dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support'])
+ dataFrame = _ab._pd.DataFrame(list(self._associationRules.items()), columns=['Patterns', 'Support'])
return dataFrame
def save(self, outFile: str) -> None:
@@ -306,24 +305,24 @@ def save(self, outFile: str) -> None:
:return: None
"""
with open(outFile, 'w') as f:
- for x, y in self._finalPatterns.items():
+ for x, y in self._associationRules.items():
x = self._sep.join(x)
f.write(f"{x} : {y}\n")
- def getPatterns(self):
+ def getAssociationRules(self):
"""
Function to send the set of frequent patterns after completion of the mining process
:return: returning frequent patterns
:rtype: dict
"""
- return self._finalPatterns
+ return self._associationRules
def printResults(self):
"""
Function to send the result after completion of the mining process
"""
- print("Total number of Association Rules:", len(self.getPatterns()))
+ print("Total number of Association Rules:", len(self.getAssociationRules()))
print("Total Memory in USS:", self.getMemoryUSS())
print("Total Memory in RSS", self.getMemoryRSS())
print("Total ExecutionTime in ms:", self.getRuntime())
@@ -338,7 +337,7 @@ def printResults(self):
_ap = lift(_ab._sys.argv[1], _ab._sys.argv[3])
_ap.startMine()
_ap.mine()
- print("Total number of Association Rules:", len(_ap.getPatterns()))
+ print("Total number of Association Rules:", len(_ap.getAssociationRules()))
_ap.save(_ab._sys.argv[2])
print("Total Memory in USS:", _ap.getMemoryUSS())
print("Total Memory in RSS", _ap.getMemoryRSS())
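leverage.py and lift.py differ from confidence.py only in the measure applied to each candidate rule: leverage(A -> C) = sup(A∪C) - sup(A) * sup(C), and lift(A -> C) = sup(A∪C) / (sup(A) * sup(C)). A hedged sketch with hypothetical relative supports (leverage.py derives these by dividing counts by maxTS; the same normalization is assumed for lift here):

.. code-block:: python

    # hypothetical relative supports (counts already divided by database size)
    sup = {('a',): 0.4, ('b',): 0.5, ('a', 'b'): 0.3}
    antecedent, consequent, itemset = ('a',), ('b',), ('a', 'b')

    leverage = sup[itemset] - sup[antecedent] * sup[consequent]  # 0.3 - 0.2 = 0.1
    lift = sup[itemset] / (sup[antecedent] * sup[consequent])    # 0.3 / 0.2 = 1.5

    # leverage > 0 (equivalently lift > 1) means the antecedent and the
    # consequent co-occur more often than independence would predict
    print(leverage, lift)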
diff --git a/PAMI/frequentPattern/basic/ECLAT.py b/PAMI/frequentPattern/basic/ECLAT.py
index 7586ca7f..c4466ccf 100644
--- a/PAMI/frequentPattern/basic/ECLAT.py
+++ b/PAMI/frequentPattern/basic/ECLAT.py
@@ -59,21 +59,21 @@ class ECLAT(_ab._frequentPatterns):
About this algorithm
====================
- :**Description**: ECLAT is one of the fundamental algorithm to discover frequent patterns in a transactional database.
+ :**Description**: *ECLAT is one of the fundamental algorithms to discover frequent patterns in a transactional database.*
:**Reference**: Mohammed Javeed Zaki: Scalable Algorithms for Association Mining. IEEE Trans. Knowl. Data Eng. 12(3):
- 372-390 (2000), https://ieeexplore.ieee.org/document/846291
+ 372-390 (2000), https://ieeexplore.ieee.org/document/846291
:**Parameters**: - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.*
- - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.*
- - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count. Otherwise, it will be treated as float.*
- - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.*
+ - **oFile** (*str*) -- *Name of the output file to store the frequent patterns.*
+ - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup as expressed in count.*
+ - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default separator is tab space. However, the users can override their default separator.*
:**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.*
- - **endTime** (*float*) -- *To record the completion time of the mining process.*
+ - **endTime** (*float*) -- *To record the end time of the mining process.*
- **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.*
- **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.*
- **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.*
- **Database** (*list*) -- *To store the transactions of a database in list.*
Execution methods
@@ -183,6 +183,58 @@ def _creatingItemSets(self) -> float:
print("File Not Found")
quit()
+ def _getUniqueItemList(self) -> list:
+ """
+
+ Generating the frequent patterns of length one
+
+ :return: list of unique patterns
+ :rtype: list
+ """
+ self._finalPatterns = {}
+ candidate = {}
+ uniqueItem = []
+ for i in range(len(self._Database)):
+ for j in range(len(self._Database[i])):
+ if self._Database[i][j] not in candidate:
+ candidate[self._Database[i][j]] = {i}
+ else:
+ candidate[self._Database[i][j]].add(i)
+ for key, value in candidate.items():
+ supp = len(value)
+ if supp >= self._minSup:
+ self._finalPatterns[key] = [value]
+ uniqueItem.append(key)
+ uniqueItem.sort()
+ return uniqueItem
+
+ def _generateFrequentPatterns(self, candidateFrequent: list) -> None:
+ """
+
+ It will generate the combinations of frequent items
+
+ :param candidateFrequent: it represents the items with their respective transaction identifiers
+ :type candidateFrequent: list
+ :return: None
+ """
+ new_freqList = []
+ for i in range(0, len(candidateFrequent)):
+ item1 = candidateFrequent[i]
+ i1_list = item1.split()
+ for j in range(i + 1, len(candidateFrequent)):
+ item2 = candidateFrequent[j]
+ i2_list = item2.split()
+ if i1_list[:-1] == i2_list[:-1]:
+ interSet = self._finalPatterns[item1][0].intersection(self._finalPatterns[item2][0])
+ if len(interSet) >= self._minSup:
+ newKey = item1 + "\t" + i2_list[-1]
+ self._finalPatterns[newKey] = [interSet]
+ new_freqList.append(newKey)
+ else: break
+
+ if len(new_freqList) > 0:
+ self._generateFrequentPatterns(new_freqList)
+
def _convert(self, value) -> float:
"""
@@ -212,30 +264,6 @@ def startMine(self) -> None:
self.mine()
-
- def __recursive(self, items, cands):
- """
-
- This function generates new candidates by taking input as original candidates.
-
- :param items: A dictionary containing items and their corresponding support values.
- :type items: dict
- :param cands: A list of candidate itemsets.
- :type cands: list
- :return: None
- """
-
- for i in range(len(cands)):
- newCands = []
- for j in range(i + 1, len(cands)):
- intersection = items[cands[i]].intersection(items[cands[j]])
- if len(intersection) >= self._minSup:
- newCand = tuple(cands[i] + tuple([cands[j][-1]]))
- newCands.append(newCand)
- items[newCand] = intersection
- self._finalPatterns[newCand] = len(intersection)
- if len(newCands) > 1:
- self.__recursive(items, newCands)
-
def mine(self) -> None:
"""
Frequent pattern mining process will start from here
@@ -247,29 +275,12 @@ def mine(self) -> None:
if self._minSup is None:
raise Exception("Please enter the Minimum Support")
self._creatingItemSets()
self._minSup = self._convert(self._minSup)
-
-
- items = {}
- index = 0
- for line in self._Database:
- for item in line:
- if item not in items:
- items[item] = []
- items[item].append(index)
- index += 1
-
- items = {tuple([k]): set(v) for k, v in items.items() if len(v) >= self._minSup}
- items = {k: v for k, v in sorted(items.items(), key=lambda item: len(item[1]), reverse=False)}
- for k, v in items.items():
- self._finalPatterns[k] = len(v)
-
- cands = list(items.keys())
-
- self.__recursive(items, cands)
-
-
+ uniqueItemList = self._getUniqueItemList()
+ self._generateFrequentPatterns(uniqueItemList)
+ for x, y in self._finalPatterns.items():
+ self._finalPatterns[x] = len(y[0])
self._endTime = _ab._time.time()
process = _ab._psutil.Process(_ab._os.getpid())
self._memoryUSS = float()
@@ -319,18 +329,11 @@ def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame:
:rtype: pd.DataFrame
"""
- # time = _ab._time.time()
- # dataFrame = {}
- # data = []
- # for a, b in self._finalPatterns.items():
- # # data.append([a.replace('\t', ' '), b])
- # data.append([" ".join(a), b])
- # dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support'])
- # print("Time taken to convert the frequent patterns into DataFrame is: ", _ab._time.time() - time)
-
-
- dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support'])
-
+ dataFrame = {}
+ data = []
+ for a, b in self._finalPatterns.items():
+ data.append([a.replace('\t', ' '), b])
+ dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support'])
return dataFrame
def save(self, outFile: str) -> None:
@@ -342,13 +345,15 @@ def save(self, outFile: str) -> None:
:type outFile: csvfile
:return: None
"""
- with open(outFile, 'w') as f:
- for x, y in self._finalPatterns.items():
- x = self._sep.join(x)
- f.write(f"{x} : {y}\n")
+ self._oFile = outFile
+ writer = open(self._oFile, 'w+')
+ for x, y in self._finalPatterns.items():
+ patternsAndSupport = x.strip() + ":" + str(y)
+ writer.write("%s \n" % patternsAndSupport)
def getPatterns(self) -> dict:
"""
+
Function to send the set of frequent patterns after completion of the mining process
:return: returning frequent patterns
@@ -383,6 +388,3 @@
print("Total ExecutionTime in ms:", _ap.getRuntime())
else:
print("Error! The number of input parameters do not match the total number of parameters provided")
-
-
-
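The _getUniqueItemList()/_generateFrequentPatterns() pair added to ECLAT.py implements the tidset form of the algorithm: each item is mapped to the set of transaction ids that contain it, and longer patterns are grown by intersecting the tidsets of two patterns sharing a prefix, so support counting never rescans the database. A compact standalone sketch of that search (the toy database and minSup are hypothetical; ECLAT.py itself keys patterns by tab-joined strings rather than tuples):

.. code-block:: python

    database = [['a', 'b', 'c'], ['a', 'b'], ['a', 'c'], ['b', 'c']]
    minSup = 2

    # 1-itemsets: map each item to the set of transaction ids containing it
    tidsets = {}
    for tid, transaction in enumerate(database):
        for item in transaction:
            tidsets.setdefault(item, set()).add(tid)
    patterns = {(item,): tids for item, tids in tidsets.items() if len(tids) >= minSup}

    def extend(prefixes):
        # join pairs sharing a common prefix; a tidset intersection yields
        # the support of the longer pattern without rescanning the database
        for i, p in enumerate(prefixes):
            new = []
            for q in prefixes[i + 1:]:
                tids = patterns[p] & patterns[q]
                if len(tids) >= minSup:
                    patterns[p + (q[-1],)] = tids
                    new.append(p + (q[-1],))
            extend(new)

    extend(sorted(patterns))
    print({pattern: len(tids) for pattern, tids in patterns.items()})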
diff --git a/PAMI/frequentPattern/basic/ECLATDiffset.py b/PAMI/frequentPattern/basic/ECLATDiffset.py
index c382679e..b9970a1a 100644
--- a/PAMI/frequentPattern/basic/ECLATDiffset.py
+++ b/PAMI/frequentPattern/basic/ECLATDiffset.py
@@ -1,10 +1,13 @@
# ECLATDiffest uses diffset to extract the frequent patterns in a transactional database.
-
+#
# **Importing this algorithm into a python program**
-# ---------------------------------------------------------
#
# import PAMI.frequentPattern.basic.ECLATDiffset as alg
#
+# iFile = 'sampleDB.txt'
+#
+# minSup = 10 # can also be specified between 0 and 1
+#
# obj = alg.ECLATDiffset(iFile, minSup)
#
# obj.mine()
@@ -31,10 +34,6 @@
#
-
-
-
-
__copyright__ = """
Copyright (C) 2021 Rage Uday Kiran
@@ -61,43 +60,28 @@ class ECLATDiffset(_ab._frequentPatterns):
"""
- :Description: ECLATDiffset uses diffset to extract the frequent patterns in a transactional database.
+ :**Description**: *ECLATDiffset uses diffset to extract the frequent patterns in a transactional database.*
- :Reference: KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining
- August 2003 Pages 326–335 https://doi.org/10.1145/956750.956788
+ :**Reference**: KDD '03: Proceedings of the ninth ACM SIGKDD international conference on Knowledge discovery and data mining
+ August 2003 Pages 326–335 https://doi.org/10.1145/956750.956788
- :param iFile: str :
- Name of the Input file to mine complete set of frequent pattern's
- :param oFile: str :
- Name of the output file to store complete set of frequent patterns
- :param minSup: int or float or str :
- The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count.
- :param sep: str :
- This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.
-
- :Attributes:
-
- startTime : float
- To record the start time of the mining process
-
- endTime : float
- To record the completion time of the mining process
-
- finalPatterns : dict
- Storing the complete set of patterns in a dictionary variable
-
- memoryUSS : float
- To store the total amount of USS memory consumed by the program
-
- memoryRSS : float
- To store the total amount of RSS memory consumed by the program
-
- Database : list
- To store the transactions of a database in list
+ :**Parameters**: - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.*
+ - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.*
+ - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup as expressed in count.*
+ - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default separator is tab space. However, the users can override their default separator.*
+
+ :**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.*
+ - **endTime** (*float*) -- *To record the end time of the mining process.*
+ - **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.*
+ - **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.*
+ - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.*
+ - **Database** (*list*) -- *To store the transactions of a database in list.*
- **Methods to execute code on terminal**
- ------------------------------------------
+ Execution methods
+ =================
+
+ **Terminal command**
..
code-block:: console @@ -109,15 +93,19 @@ class ECLATDiffset(_ab._frequentPatterns): (.venv) $ python3 ECLATDiffset.py sampleDB.txt patterns.txt 10.0 - .. note:: minSup will be considered in percentage of database transactions + .. note:: minSup can be specified in support count or a value between 0 and 1. - **Importing this algorithm into a python program** - --------------------------------------------------------- + **Calling from a python program** + .. code-block:: python import PAMI.frequentPattern.basic.ECLATDiffset as alg + iFile = 'sampleDB.txt' + + minSup = 10 # can also be specified between 0 and 1 + obj = alg.ECLATDiffset(iFile, minSup) obj.mine() @@ -143,10 +131,10 @@ class ECLATDiffset(_ab._frequentPatterns): print("Total ExecutionTime in seconds:", run) - **Credits:** - ------------------- + Credits: + ======== - The complete program was written by Kundai under the supervision of Professor Rage Uday Kiran. + The complete program was written by Kundai and revised by Tarun Sreepada under the supervision of Professor Rage Uday Kiran. """ @@ -197,7 +185,9 @@ def _creatingItemSets(self): def _convert(self, value): """ + To convert the user specified minSup value + :param value: user specified minSup value :return: converted type """ @@ -243,7 +233,9 @@ def _getUniqueItemList(self): def _runDeclat(self, candidateList): """ + It will generate the combinations of frequent items + :param candidateList :it represents the items with their respective transaction identifiers :type candidateList: list :return: returning transaction dictionary @@ -275,7 +267,6 @@ def startMine(self): """ Frequent pattern mining process will start from here """ - self.mine() def mine(self): @@ -310,7 +301,9 @@ def mine(self): def getMemoryUSS(self): """ + Total amount of USS memory consumed by the mining process will be retrieved from this function + :return: returning USS memory consumed by the mining process :rtype: float """ @@ -319,7 +312,9 @@ def getMemoryUSS(self): def getMemoryRSS(self): """ + Total amount of RSS memory consumed by the mining process will be retrieved from this function + :return: returning RSS memory consumed by the mining process :rtype: float """ @@ -328,7 +323,9 @@ def getMemoryRSS(self): def getRuntime(self): """ + Calculating the total amount of runtime taken by the mining process + :return: returning total amount of runtime taken by the mining process :rtype: float """ @@ -337,7 +334,9 @@ def getRuntime(self): def getPatternsAsDataFrame(self): """ + Storing final frequent patterns in a dataframe + :return: returning frequent patterns in a dataframe :rtype: pd.DataFrame """ @@ -351,7 +350,9 @@ def getPatternsAsDataFrame(self): def save(self, outFile): """ + Complete set of frequent patterns will be loaded in to an output file + :param outFile: name of the output file :type outFile: csvfile """ @@ -363,7 +364,9 @@ def save(self, outFile): def getPatterns(self): """ + Function to send the set of frequent patterns after completion of the mining process + :return: returning frequent patterns :rtype: dict """ diff --git a/PAMI/frequentPattern/basic/ECLATbitset.py b/PAMI/frequentPattern/basic/ECLATbitset.py index c1d12ef2..e5b2d9d9 100644 --- a/PAMI/frequentPattern/basic/ECLATbitset.py +++ b/PAMI/frequentPattern/basic/ECLATbitset.py @@ -57,24 +57,21 @@ class ECLATbitset(_ab._frequentPatterns): """ - About this algorithm - ==================== - :*Description*: ECLATbitset is one of the fundamental algorithm to discover frequent patterns in a transactional database. 
:*Reference*: Mohammed Javeed Zaki: Scalable Algorithms for Association Mining. IEEE Trans. Knowl. Data Eng. 12(3): - 372-390 (2000), https://ieeexplore.ieee.org/document/846291 + 372-390 (2000), https://ieeexplore.ieee.org/document/846291 :**Parameters**: - **iFile** (*str or URL or dataFrame*) -- *Name of the Input file to mine complete set of frequent patterns.* - - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns.* - - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count. Otherwise, it will be treated as float.* - - **sep** (*str*) -- *This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.* + - **oFile** (*str*) -- *Name of the output file to store complete set of frequent patterns* + - **minSup** (*int or float or str*) -- *The user can specify minSup either in count or proportion of database size. If the program detects the data type of minSup is integer, then it treats minSup is expressed in count.* + - **sep** (*str*) -- **This variable is used to distinguish items from one another in a transaction. The default seperator is tab space. However, the users can override their default separator.** :**Attributes**: - **startTime** (*float*) -- *To record the start time of the mining process.* - - **endTime** (*float*) -- *To record the completion time of the mining process.* + - **endTime** (*float*) -- *To record the end time of the mining process.* - **finalPatterns** (*dict*) -- *Storing the complete set of patterns in a dictionary variable.* - **memoryUSS** (*float*) -- *To store the total amount of USS memory consumed by the program.* - - **memoryRSS** (*float*) -- *To store the total amount of RSS memory consumed by the program.* + - **memoryRSS** *(float*) -- *To store the total amount of RSS memory consumed by the program.* - **Database** (*list*) -- *To store the transactions of a database in list.* Execution methods @@ -82,8 +79,6 @@ class ECLATbitset(_ab._frequentPatterns): **Terminal command** - .. code-block:: console - Format: (.venv) $ python3 ECLATbitset.py @@ -151,6 +146,7 @@ class ECLATbitset(_ab._frequentPatterns): def _convert(self, value): """ + To convert the user specified minSup value :param value: user specified minSup value @@ -208,6 +204,7 @@ def _creatingItemSets(self): def startMine(self): """ Frequent pattern mining process will start from here + We start with the scanning the itemSets and store the bitsets respectively. We form the combinations of single items and check with minSup condition to check the frequency of patterns """ @@ -215,6 +212,7 @@ def startMine(self): def _bitPacker(self, data, maxIndex): """ + It takes the data and maxIndex as input and generates integer as output value. :param data: it takes data as input. @@ -227,31 +225,6 @@ def _bitPacker(self, data, maxIndex): packed_bits |= 1 << (maxIndex - i) return packed_bits - - def __recursive(self, items, cands): - """ - - This function generates new candidates by taking input as original candidates. - - :param items: A dictionary containing items and their corresponding support values. - :type items: dict - :param cands: A list of candidate itemsets. 
- :type cands: list - :return: None - """ - - for i in range(len(cands)): - newCands = [] - for j in range(i + 1, len(cands)): - intersection = items[cands[i]] & items[cands[j]] - support = int.bit_count(intersection) - if support >= self._minSup: - newCand = tuple(cands[i] + tuple([cands[j][-1]])) - newCands.append(newCand) - items[newCand] = intersection - self._finalPatterns[newCand] = support - if len(newCands) > 1: - self.__recursive(items, newCands) def mine(self) -> None: """ @@ -275,15 +248,35 @@ def mine(self) -> None: index += 1 # sort by length in descending order - items = dict(sorted(items.items(), key=lambda x: len(x[1]), reverse=False)) + items = dict(sorted(items.items(), key=lambda x: len(x[1]), reverse=True)) cands = [] for key in items: if len(items[key]) >= self._minSup: - self._finalPatterns[key] = len(items[key]) + self._finalPatterns["\t".join(key)] = len(items[key]) cands.append(key) items[key] = self._bitPacker(items[key], index) + # print(key, items[key]) + else: + break - self.__recursive(items, cands) + while cands: + newCands = [] + for i in range(len(cands)): + for j in range(i + 1, len(cands)): + if cands[i][:-1] == cands[j][:-1]: + newCand = tuple(cands[i] + tuple([cands[j][-1]])) + intersection = items[tuple([newCand[0]])] + for k in range(1, len(newCand)): + intersection &= items[tuple([newCand[k]])] + count = int.bit_count(intersection) + if count >= self._minSup: + newCands.append(newCand) + newCand = "\t".join(newCand) + self._finalPatterns[newCand] = count + else: + break + + cands = newCands self._endTime = _ab._time.time() process = _ab._psutil.Process(_ab._os.getpid()) @@ -326,7 +319,7 @@ def getRuntime(self): return self._endTime - self._startTime - def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame: + def getPatternsAsDataFrame(self): """ Storing final frequent patterns in a dataframe @@ -335,34 +328,26 @@ def getPatternsAsDataFrame(self) -> _ab._pd.DataFrame: :rtype: pd.DataFrame """ - # time = _ab._time.time() - # dataFrame = {} - # data = [] - # for a, b in self._finalPatterns.items(): - # # data.append([a.replace('\t', ' '), b]) - # data.append([" ".join(a), b]) - # dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) - # print("Time taken to convert the frequent patterns into DataFrame is: ", _ab._time.time() - time) - - - dataFrame = _ab._pd.DataFrame(list(self._finalPatterns.items()), columns=['Patterns', 'Support']) - + dataFrame = {} + data = [] + for a, b in self._finalPatterns.items(): + data.append([a.replace('\t', ' '), b]) + dataFrame = _ab._pd.DataFrame(data, columns=['Patterns', 'Support']) return dataFrame - def save(self, outFile: str) -> None: + def save(self, outFile): """ Complete set of frequent patterns will be loaded in to an output file - :param outFile: name of the output file - :type outFile: csvfile - :return: None + :param outFile: name of the outputfile + :type outFile: file """ - with open(outFile, 'w') as f: - for x, y in self._finalPatterns.items(): - x = self._sep.join(x) - f.write(f"{x} : {y}\n") - + self._oFile = outFile + writer = open(self._oFile, 'w+') + for x, y in self._finalPatterns.items(): + patternsAndSupport = x.strip() + ":" + str(y) + writer.write("%s \n" % patternsAndSupport) def getPatterns(self): """ @@ -399,6 +384,4 @@ def printResults(self): print("Total Memory in RSS", _ap.getMemoryRSS()) print("Total ExecutionTime in ms:", _ap.getRuntime()) else: - print("Error! The number of input parameters do not match the total number of parameters provided") - - + print("Error! 
The number of input parameters does not match the total number of parameters provided")
\ No newline at end of file
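For the two ECLAT variants above, the only change is the vertical representation. ECLATDiffset keeps, per pattern, the transactions that do NOT contain it relative to its prefix, so support(pq) = support(p) - |diffset(q) - diffset(p)|; ECLATbitset packs each tidset into one integer via _bitPacker so that an intersection becomes a bitwise AND and support counting a popcount (int.bit_count, Python 3.10+). Two hedged, standalone sketches over a hypothetical toy database:

.. code-block:: python

    # diffset representation behind ECLATDiffset's _runDeclat; toy data only
    database = [['a', 'b', 'c'], ['a', 'b'], ['a', 'c'], ['b', 'c']]
    allTids = set(range(len(database)))

    tidsets = {}
    for tid, transaction in enumerate(database):
        for item in transaction:
            tidsets.setdefault(item, set()).add(tid)

    # diffset of an item = transactions missing it; support follows by subtraction
    diffsets = {item: allTids - tids for item, tids in tidsets.items()}
    supports = {item: len(database) - len(d) for item, d in diffsets.items()}
    d_ab = diffsets['b'] - diffsets['a']  # diffset of 'ab' relative to prefix 'a'
    print(supports['a'] - len(d_ab))      # 2: 'a' and 'b' co-occur in tids 0 and 1

.. code-block:: python

    # bit-vector representation matching ECLATbitset's _bitPacker; toy data only
    database = [['a', 'b', 'c'], ['a', 'b'], ['a', 'c'], ['b', 'c']]
    width = len(database)

    def pack(tids, width):
        # set bit (width - tid) for every transaction id, as _bitPacker does
        bits = 0
        for tid in tids:
            bits |= 1 << (width - tid)
        return bits

    tidsets = {}
    for tid, transaction in enumerate(database):
        for item in transaction:
            tidsets.setdefault(item, set()).add(tid)
    bitsets = {item: pack(tids, width) for item, tids in tidsets.items()}

    # intersection is a bitwise AND; support is a popcount
    print((bitsets['a'] & bitsets['b']).bit_count())  # 2

Packing tidsets into integers trades set bookkeeping for hardware popcounts; ECLATbitset recomputes each candidate's bitset from singleton bitsets rather than caching intersections per prefix, which is simpler but does redundant work.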