Skip to content

Commit

Permalink
Merge branch 'UdayLab:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
vanithakattumuri authored Apr 30, 2024
2 parents 96831d7 + be4635a commit a096a7f
Show file tree
Hide file tree
Showing 4 changed files with 144 additions and 5 deletions.
8 changes: 3 additions & 5 deletions .readthedocs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,8 @@ build:

# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: finalSphinxDocs/conf.py
fail_on_warning: false

#true if you want to stop the building process
configuration: sphinx/conf.py
fail_on_warning: false #true

# Optionally build your docs in additional formats such as PDF and ePub
formats:
Expand All @@ -37,4 +35,4 @@ formats:
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: binder/requirements.txt
- requirements: binder/requirements.txt
136 changes: 136 additions & 0 deletions PAMI/frequentPattern/basic/Apriori2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
from typing import Dict, List, Set, Union
from datetime import datetime
from deprecated import deprecated
import pandas as pd
import psutil
import os

class Apriori:
"""
Apriori algorithm for frequent pattern mining in transactional databases.
Args:
iFile (str): Input file name or path of the input file.
minSup (Union[int, float, str]): Minimum support threshold. If int, treated as count. If float, treated as proportion of database size.
sep (str, optional): Separator used to distinguish items from each other in a transaction. Default is '\t'.
Attributes:
minSup (float): Minimum support threshold.
startTime (float): Start time of the mining process.
endTime (float): End time of the mining process.
frequentPatterns (Dict[str, int]): Dictionary storing the complete set of patterns.
database (List[Set[str]]): List to store transactions of the database.
Methods:
mine(): Perform the frequent pattern mining process.
getMemoryUsage(): Get the total memory consumed.
getRuntime(): Get the total runtime of the mining process.
getPatternsAsDataFrame(): Get frequent patterns as a DataFrame.
savePatterns(outFile): Save the final patterns into a file.
getPatterns(): Get the set of frequent patterns.
printResults(): Print the results of the execution.
"""

def __init__(self, iFile: str, minSup: Union[int, float, str], sep: str = '\t'):
    """
    Load the transactional database and resolve the support threshold.
    Args:
        iFile (str): Input file name or path of the input file.
        minSup (Union[int, float, str]): Minimum support threshold, passed to
            ``_convertMinSup`` for conversion.
        sep (str, optional): Separator between the items of a transaction.
            Default is '\t'.
    """
    # The database has to be loaded first: _convertMinSup scales a
    # proportional threshold by len(self.database), so converting before
    # loading raised AttributeError for every float / "0.x" threshold.
    self.database = self._loadDatabase(iFile, sep)
    self.minSup = self._convertMinSup(minSup)
    self.startTime = 0.0
    self.endTime = 0.0
    self.frequentPatterns = {}

def _convertMinSup(self, minSup: Union[int, float, str]) -> float:
    """
    Convert a user-supplied minimum support into an absolute threshold.
    Args:
        minSup (Union[int, float, str]): An int is used as an absolute count.
            A float is multiplied by the number of loaded transactions.
            A string is parsed as an int when possible, otherwise as a float,
            and then treated the same way.
    Returns:
        float: The threshold that pattern supports are compared against.
    Raises:
        TypeError: If minSup is not an int, float or str.
        ValueError: If minSup is a string that is not a number.
    """
    if isinstance(minSup, int):
        return minSup
    if isinstance(minSup, float):
        return len(self.database) * minSup
    if isinstance(minSup, str):
        text = minSup.strip()
        try:
            # Integer-looking strings ("5") are absolute counts.
            return int(text)
        except ValueError:
            # Anything else ("0.5", "1e-2") is a proportion of the database.
            return len(self.database) * float(text)
    # Previously this fell through and returned None, which only surfaced
    # later as an obscure comparison error during mining.
    raise TypeError(
        "minSup must be an int, float or str, not " + type(minSup).__name__
    )

def _loadDatabase(self, iFile: str, sep: str) -> List[Set[str]]:
    """
    Read the input file into a list of transactions.
    Args:
        iFile (str): Path of the file to read; one transaction per line.
        sep (str): Separator between the items on a line.
    Returns:
        List[Set[str]]: One set of items per non-empty line.
    """
    database = []
    with open(iFile, 'r') as f:
        for line in f:
            # Drop empty tokens (blank lines, doubled or trailing separators);
            # otherwise the empty string is mined as if it were a real item.
            items = {token.strip() for token in line.strip().split(sep)}
            items.discard('')
            if items:
                database.append(items)
    return database

def mine(self) -> None:
    """
    Perform the frequent pattern mining process.

    Candidates are evaluated level by level: each round counts the current
    candidates, keeps those whose support reaches ``self.minSup`` and asks
    ``_generateCandidates`` for the next round. The result is stored in
    ``self.frequentPatterns`` (pattern string -> support), and
    ``self.startTime`` / ``self.endTime`` are set to datetimes.
    """
    self.startTime = datetime.now()
    # Start from each distinct item exactly once; listing an item once per
    # occurrence duplicated candidates and made the search loop forever.
    distinctItems = sorted({item for transaction in self.database for item in transaction})
    candidates = [{item} for item in distinctItems]
    evaluated = set()   # every itemset already counted, frequent or not
    supports = {}       # frozenset(pattern) -> support, insertion ordered
    while candidates:
        counts = self._countCandidates(candidates)
        currentLevel = []
        for candidate in candidates:
            evaluated.add(frozenset(candidate))
            # A candidate contained in no transaction may be absent from counts.
            support = counts.get(tuple(candidate), 0)
            if support >= self.minSup:
                supports[frozenset(candidate)] = support
                currentLevel.append(candidate)
        if not currentLevel:
            break
        # Extend only the newest level, and never re-evaluate an itemset,
        # so the loop is guaranteed to terminate.
        candidates = []
        for candidate in self._generateCandidates(currentLevel):
            key = frozenset(candidate)
            if key not in evaluated:
                evaluated.add(key)
                candidates.append(candidate)
    self.frequentPatterns = {
        self._setToStr(set(pattern)): support for pattern, support in supports.items()
    }
    self.endTime = datetime.now()

def _countCandidates(self, candidates: List[Set[str]]) -> Dict[tuple, int]:
    """
    Count how many transactions contain each candidate itemset.
    Args:
        candidates (List[Set[str]]): Itemsets to count.
    Returns:
        Dict[tuple, int]: Support of every candidate, keyed by
            ``tuple(candidate)``. Candidates contained in no transaction are
            present with a count of 0.
    """
    counts = {}
    for candidate in candidates:
        key = tuple(candidate)
        if key in counts:
            # A repeated candidate must not be counted twice; accumulating
            # per occurrence used to inflate its support.
            continue
        counts[key] = sum(1 for transaction in self.database if candidate.issubset(transaction))
    return counts

def _generateCandidates(self, frequentSets: List[Set[str]]) -> List[Set[str]]:
    """
    Build the next level of candidate itemsets from frequent itemsets.

    Only the largest itemsets in ``frequentSets`` (size k) are joined, so the
    method behaves the same whether it receives just the latest level or an
    accumulated list. Two k-itemsets are joined when their sorted items share
    the first k-1 entries. A joined (k+1)-itemset is kept only if every one of
    its k-item subsets is itself among the frequent k-itemsets.
    Args:
        frequentSets (List[Set[str]]): Frequent itemsets found so far.
    Returns:
        List[Set[str]]: Distinct candidate itemsets of size k+1.
    """
    # Local import: the file does not import itertools at module level.
    from itertools import combinations

    if not frequentSets:
        return []
    size = max(len(pattern) for pattern in frequentSets)
    level = {frozenset(pattern) for pattern in frequentSets if len(pattern) == size}
    # Sorted tuples give a stable item order; raw set iteration order is
    # arbitrary, which made the original prefix comparison unreliable.
    ordered = sorted(tuple(sorted(pattern)) for pattern in level)
    newCandidates = []
    for i, first in enumerate(ordered):
        for second in ordered[i + 1:]:
            if first[:-1] != second[:-1]:
                # Tuples are sorted, so no later tuple shares this prefix.
                break
            candidate = frozenset(first) | {second[-1]}
            if all(frozenset(subset) in level for subset in combinations(candidate, size)):
                newCandidates.append(set(candidate))
    return newCandidates

def _isSubset(self, subset: Set[str], superset: List[Set[str]]) -> bool:
    """
    Tell whether ``subset`` is contained in at least one pattern.
    Args:
        subset (Set[str]): Itemset to look for.
        superset (List[Set[str]]): Patterns to search.
    Returns:
        bool: True if some pattern in ``superset`` contains every item of
            ``subset``; False otherwise (including when ``superset`` is empty).
    """
    for pattern in superset:
        if subset.issubset(pattern):
            return True
    return False

def _getSubsets(self, pattern: Set[str]) -> List[Set[str]]:
    """
    Return the non-empty entries of ``_powerSet(pattern)`` as sets.
    Args:
        pattern (Set[str]): Itemset whose subsets are wanted.
    Returns:
        List[Set[str]]: Every non-empty subset produced by ``_powerSet``,
            in the order ``_powerSet`` yields them.
    """
    nonEmpty = []
    for members in self._powerSet(pattern):
        if members:
            nonEmpty.append(set(members))
    return nonEmpty

def _powerSet(self, pattern: Set[str]) -> List[List[str]]:
    """
    Enumerate every subset of ``pattern``, from the empty one to the full set.
    Args:
        pattern (Set[str]): Itemset to expand.
    Returns:
        List[List[str]]: All subsets as lists, grouped by increasing size.
            The empty list is always the first entry.
    """
    # combinations was used without ever being imported, so every call
    # raised NameError; import it locally to keep the method self-contained.
    from itertools import combinations

    items = list(pattern)
    return [list(subset) for i in range(len(items) + 1) for subset in combinations(items, i)]

def _setToStr(self, pattern: Set[str]) -> str:
    """
    Render an itemset as a single tab-separated string.
    Args:
        pattern (Set[str]): Itemset to render.
    Returns:
        str: The items in sorted order, joined by tab characters.
    """
    orderedItems = list(pattern)
    orderedItems.sort()
    return '\t'.join(orderedItems)

def _getSupport(self, pattern: Set[str]) -> int:
    """
    Count the transactions in ``self.database`` that contain ``pattern``.
    Args:
        pattern (Set[str]): Itemset to look for.
    Returns:
        int: Number of transactions that include every item of ``pattern``.
    """
    support = 0
    for transaction in self.database:
        if pattern.issubset(transaction):
            support += 1
    return support

def getMemoryUsage(self) -> float:
    """
    Report the memory footprint of the running Python process.
    Returns:
        float: The ``uss`` field of psutil's ``memory_full_info()`` for the
            current process.
    """
    currentPid = os.getpid()
    memoryInfo = psutil.Process(currentPid).memory_full_info()
    return memoryInfo.uss

def getRuntime(self) -> float:
    """
    Get the total runtime of the mining process.
    Returns:
        float: Total runtime in seconds. This is 0.0 when the start and end
            times still hold their initial numeric values.
    """
    elapsed = self.endTime - self.startTime
    # The timestamps are initialised as plain floats and only become
    # datetimes once mine() runs; a float difference has no total_seconds().
    if isinstance(elapsed, (int, float)):
        return float(elapsed)
    return elapsed.total_seconds()

def getPatternsAsDataFrame(self) -> pd.DataFrame:
    """
    Expose ``self.frequentPatterns`` as a two-column table.
    Returns:
        pd.DataFrame: One row per stored pattern, with the pattern string in
            'Patterns' and its support in 'Support'.
    """
    rows = []
    for pattern in self.frequentPatterns:
        rows.append([pattern, self.frequentPatterns[pattern]])
    return pd.DataFrame(rows, columns=['Patterns', 'Support'])

def savePatterns(self, outFile:
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# Table of Contents

- [Introduction](#introduction)
- [Development process](#process-flow-chart)
- [Recent updates](#recent-updates)
- [Features](#features)
- [Maintenance](#Maintenance)
Expand Down Expand Up @@ -59,7 +60,11 @@ PAttern MIning (PAMI) is a Python library containing several algorithms to disco
8. Report issues https://github.com/UdayLab/PAMI/issues

***
# Process Flow Chart

<img width="573" alt="PAMI's production process" src="https://raw.githubusercontent.com/UdayLab/PAMI/main/images/pamiDevelopmentSteps.png">

***
# Recent Updates

- Version 2023.07.07: New algorithms: cuApriori, cuAprioriBit, cuEclat, cuEclatBit, gPPMiner, cuGPFMiner, FPStream, HUPMS, SHUPGrowth. New codes to generate synthetic databases
Expand Down
Binary file added images/pamiDevelopmentSteps.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit a096a7f

Please sign in to comment.