Let apriori always use low_memory processing
Thanks to previous optimizations, processing with low_memory=True is
now almost as efficient as with low_memory=False, and makes it possible
to process much larger datasets.
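
For illustration, the user-facing call is unchanged; a minimal usage
sketch (the transactions below are made up, and the low_memory keyword is
assumed to remain in the signature for backward compatibility):

    import pandas as pd
    from mlxtend.preprocessing import TransactionEncoder
    from mlxtend.frequent_patterns import apriori

    # Made-up transactions; any one-hot encoded DataFrame works.
    transactions = [['milk', 'bread'],
                    ['bread', 'butter'],
                    ['milk', 'bread', 'butter']]
    te = TransactionEncoder()
    df = pd.DataFrame(te.fit(transactions).transform(transactions),
                      columns=te.columns_)

    # low_memory is assumed to stay accepted; after this commit it no
    # longer selects a separate, slower code path.
    frequent = apriori(df, min_support=0.5, use_colnames=True, low_memory=True)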

Removing the low_memory=False code path makes the code simpler and lets
itemsets be produced by a generator, which saves even more memory.

The downside is that the number of itemsets to process is not known in
advance, so it is displayed after processing instead.  Note that commit 2f928cb
introduced a bug: the number of processed combinations was multiplied
by the itemset length, which explains why the output differs now.
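
The pattern can be shown in isolation: a generator cannot report its
length up front, so a trailing counter is yielded as a sentinel and
stripped off after np.fromiter.  A minimal sketch with hypothetical values:

    import numpy as np

    def filtered_with_count(values, threshold):
        # Yield the values that pass, then the number examined; a generator
        # cannot know that count before it is exhausted.
        examined = 0
        for v in values:
            examined += 1
            if v >= threshold:
                yield v
        yield examined  # sentinel, stripped off by the caller

    out = np.fromiter(filtered_with_count([3, 7, 1, 9], 5), dtype=int)
    kept, processed = out[:-1], out[-1]  # [7 9], 4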
dbarbier committed Dec 20, 2019
1 parent 58c95f1 commit 4c82dcf
Showing 1 changed file with 35 additions and 52 deletions.

mlxtend/frequent_patterns/apriori.py
@@ -61,40 +61,36 @@ def generate_new_combinations(old_combinations):
                 # early exit from for-loop skips else clause just below
                 break
         else:
-            yield from candidate
+            yield candidate
         j = j + 1
 
 
-def compute_supports_low_memory(X, is_sparse, combin):
-    supports = np.zeros(combin.shape[0])
-    ncomb, nitems = combin.shape
+def generate_supports_and_itemsets(X, is_sparse, combin, min_support):
+    counter = 0
     if is_sparse:
-        _bools = X[:, 0].toarray()
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]].toarray()
-            for j in range(1, nitems):
-                _bools[:] &= X[:, combin[c, j]].toarray()
-            supports[c] = np.count_nonzero(_bools)
+        count = np.empty(X.shape[0], dtype=int)
+        for itemset in combin:
+            counter += 1
+            count[:] = 0
+            for item in itemset:
+                # much faster than X[:, item].toarray() or X.getcol(item).indices
+                count[X.indices[X.indptr[item]:X.indptr[item+1]]] += 1
+            support = np.count_nonzero(count == len(itemset))
+            if support >= min_support:
+                yield support
+                yield from itemset
     else:
-        _bools = np.copy(X[:, 0])
-        for c in range(ncomb):
-            _bools[:] = X[:, combin[c, 0]]
-            for j in range(1, nitems):
-                _bools[:] &= X[:, combin[c, j]]
-            supports[c] = np.count_nonzero(_bools)
-    return supports
-
-
-def compute_supports(X, is_sparse, combin):
-    all_ones = np.ones((X.shape[0], 1))
-    if is_sparse:
-        _bools = X[:, combin[:, 0]] == all_ones
-        for n in range(1, combin.shape[1]):
-            _bools = _bools & (X[:, combin[:, n]] == all_ones)
-    else:
-        _bools = np.all(X[:, combin], axis=2)
-
-    return np.sum(_bools, axis=0)
+        for itemset in combin:
+            counter += 1
+            _bools = np.ones(X.shape[0], dtype=bool)
+            for item in itemset:
+                _bools[:] &= X[:, item]
+            support = np.count_nonzero(_bools)
+            if support >= min_support:
+                yield support
+                yield from itemset
+    # return the total of processed itemsets as last element
+    yield counter
 
 
 def apriori(df, min_support=0.5, use_colnames=False, max_len=None, verbose=0,
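
The sparse branch above counts itemset membership straight from the CSC
arrays instead of materializing columns.  A standalone sketch of the same
trick on a tiny made-up matrix:

    import numpy as np
    from scipy.sparse import csc_matrix

    # Rows are transactions, columns are items (made-up data).
    X = csc_matrix(np.array([[1, 1, 0, 1],
                             [0, 1, 1, 1],
                             [1, 1, 1, 0]]))

    itemset = (1, 3)  # hypothetical candidate
    count = np.zeros(X.shape[0], dtype=int)
    for item in itemset:
        # X.indptr[item]:X.indptr[item + 1] delimits column `item` in CSC
        # storage; X.indices holds the row indices of its nonzero entries.
        count[X.indices[X.indptr[item]:X.indptr[item + 1]]] += 1

    # A row holds the whole itemset iff every item incremented its counter.
    support = np.count_nonzero(count == len(itemset))  # 2 (rows 0 and 1)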
@@ -223,38 +219,25 @@ def _support(_x, _n_rows, _is_sparse):
     support_dict = {1: support[support >= min_support]}
     itemset_dict = {1: ary_col_idx[support >= min_support].reshape(-1, 1)}
     max_itemset = 1
-    rows_count = float(X.shape[0])
-
-    all_ones = np.ones((int(rows_count), 1))
 
     while max_itemset and max_itemset < (max_len or float('inf')):
         next_max_itemset = max_itemset + 1
 
         combin = generate_new_combinations(itemset_dict[max_itemset])
-        combin = np.fromiter(combin, dtype=int)
-        combin = combin.reshape(-1, next_max_itemset)
+        gen_itemsets = generate_supports_and_itemsets(X, is_sparse, combin, int(min_support * X.shape[0]))
 
-        if combin.size == 0:
-            break
+        support_valid_itemsets = np.fromiter(gen_itemsets, dtype=int)
+        processed_itemsets = support_valid_itemsets[-1]
+
         if verbose:
             print(
-                '\rProcessing %d combinations | Sampling itemset size %d' %
-                (combin.size, next_max_itemset), end="")
+                '\rProcessed %d combinations | Sampling itemset size %d' %
+                (processed_itemsets, next_max_itemset), end="")
 
-        # With exceptionally large datasets, the matrix operations can use a
-        # substantial amount of memory. For low memory applications or large
-        # datasets, set `low_memory=True` to use a slower but more memory-
-        # efficient implementation.
-        if low_memory:
-            support = compute_supports_low_memory(X, is_sparse, combin)
-        else:
-            support = compute_supports(X, is_sparse, combin)
-        support /= rows_count
-
-        _mask = (support >= min_support)
-        if any(_mask):
-            itemset_dict[next_max_itemset] = np.array(combin[_mask])
-            support_dict[next_max_itemset] = np.array(support[_mask])
+        support_valid_itemsets = support_valid_itemsets[:-1].reshape(-1, 1 + next_max_itemset)
+        if support_valid_itemsets.size > 0:
+            itemset_dict[next_max_itemset] = support_valid_itemsets[:, 1:]
+            support_dict[next_max_itemset] = support_valid_itemsets[:, 0] / X.shape[0]
             max_itemset = next_max_itemset
         else:
             # Exit condition
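
The flat array produced by the generator interleaves each support with its
itemset, with the processed-itemsets counter last; decoding it is a reshape
plus two slices.  A small sketch with made-up numbers:

    import numpy as np

    # Hypothetical stream for itemsets of size 2: (support, item, item)
    # records, then the total number of candidates examined.
    flat = np.array([2, 1, 3,   # support 2 for itemset (1, 3)
                     2, 0, 1,   # support 2 for itemset (0, 1)
                     5])        # sentinel: 5 candidates processed
    next_max_itemset = 2

    records = flat[:-1].reshape(-1, 1 + next_max_itemset)
    itemsets = records[:, 1:]        # [[1 3], [0 1]]
    supports = records[:, 0] / 3     # divide by the row count (3 here)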
