Skip to content

Commit

Permalink
Add get_synthetic_dataset function to util (apache#146)
Browse files Browse the repository at this point in the history
* Add get_synthetic_datasets

* Move to test_utils

* Remove _get_uniform_dataset

* Move validation to its own function

* Refactor the validation code for csr generation

* Make test_powerlaw a nested function

* Change SparseNDArray to CSRNDArray

* Merge with dtype specific changes in test_utils
  • Loading branch information
anirudh2290 authored and eric-haibin-lin committed Aug 10, 2017
1 parent 8040953 commit 2d93d72
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 6 deletions.
136 changes: 130 additions & 6 deletions python/mxnet/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,11 +96,130 @@ def random_sample(population, k):
return population_copy[0:k]


def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
"""Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np) """
def _validate_csr_generation_inputs(num_rows, num_cols, density,
distribution="uniform"):
"""Validates inputs for csr generation helper functions
"""
total_nnz = int(num_rows * num_cols * density)
if density < 0 or density > 1:
raise ValueError("density has to be between 0 and 1")

if num_rows <= 0 or num_cols <= 0:
raise ValueError("num_rows or num_cols should be greater than 0")

if distribution == "powerlaw":
if total_nnz < 2 * num_rows:
raise ValueError("not supported for this density: %s"
" for this shape (%s, %s)"
" Please keep :"
" num_rows * num_cols * density >= 2 * num_rows"
% (density, num_rows, num_cols))


def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
"""Returns CSRNDArray with uniform distribution
This generates a csr matrix with totalnnz unique randomly chosen numbers
from num_rows*num_cols and arranges them in the 2d array in the
following way: row_index = (random_number_generated / num_rows)
col_index = random_number_generated - row_index * num_cols
"""
_validate_csr_generation_inputs(num_rows, num_cols, density,
distribution="uniform")
csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices,
(num_rows, num_cols), dtype=dtype)
return result


def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
"""Returns CSRNDArray with powerlaw distribution
with exponentially increasing number of non zeros in each row.
Not supported for cases where total_nnz < 2*num_rows. This is because
the algorithm first tries to ensure that there are rows with no zeros by
putting non zeros at beginning of each row.
"""

_validate_csr_generation_inputs(num_rows, num_cols, density,
distribution="powerlaw")

total_nnz = int(num_rows * num_cols * density)

unused_nnz = total_nnz
output_arr = np.zeros((num_rows, num_cols), dtype=dtype)
# Start with ones on each row so that no row is empty
for row in range(num_rows):
output_arr[row][0] = 1 + rnd.uniform(0.001, 2)
unused_nnz = unused_nnz - 1
if unused_nnz <= 0:
return mx.nd.array(output_arr).tostype("csr")

# Populate rest of matrix with 2^i items in ith row.
# if we have used all total nnz return the sparse matrix
# else if we reached max column size then fill up full columns until we use all nnz
col_max = 2
for row in range(num_rows):
col_limit = min(num_cols, col_max)
# In case col_limit reached assign same value to all elements, which is much faster
if col_limit == num_cols and unused_nnz > col_limit:
output_arr[row] = 1 + rnd.uniform(0.001, 2)
unused_nnz = unused_nnz - col_limit + 1
if unused_nnz <= 0:
return mx.nd.array(output_arr).tostype("csr")
else:
continue
for col_index in range(1, col_limit):
output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2)
unused_nnz = unused_nnz - 1
if unused_nnz <= 0:
return mx.nd.array(output_arr).tostype("csr")
col_max = col_max * 2

if unused_nnz > 0:
#return mx.nd.array(sp.random(num_rows, num_cols, density).toarray()).tostype("csr")
raise ValueError("not supported for this density: %s"
" for this shape (%s,%s)" % (density, num_rows, num_cols))
else:
return mx.nd.array(output_arr).tostype("csr")


def rand_sparse_ndarray(shape, stype, density=None, distribution="uniform", dtype=None):
"""Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np)
Parameters
----------
shape: list or tuple
stype: str, valid values: "csr" or "row_sparse"
density, optional: float, should be between 0 and 1
distribution, optional: str, valid values: "uniform" or "powerlaw"
dtype, optional: numpy.dtype, default value is None
Returns
-------
Result of type CSRNDArray or RowSparseNDArray
Examples
--------
Below is an example of the powerlaw distribution with csr as the stype.
It calculates the nnz using the shape and density.
It fills up the ndarray with exponentially increasing number of elements.
If there are enough unused_nnzs, n+1th row will have twice more nnzs compared to nth row.
else, remaining unused_nnzs will be used in n+1th row
If number of cols is too small and we have already reached column size it will fill up
all following columns in all followings rows until we reach the required density.
>>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr",
density=0.50, distribution="powerlaw")
>>> indptr = csr_arr.indptr.asnumpy()
>>> indices = csr_arr.indices.asnumpy()
>>> data = csr_arr.data.asnumpy()
>>> row2nnz = len(data[indptr[1]:indptr[2]])
>>> row3nnz = len(data[indptr[2]:indptr[3]])
>>> assert(row3nnz == 2*row2nnz)
>>> row4nnz = len(data[indptr[3]:indptr[4]])
>>> assert(row4nnz == 2*row3nnz)
"""
density = rnd.rand() if density is None else density
dtype = default_dtype() if dtype is None else dtype
if stype == 'row_sparse':
assert (distribution == "uniform"), \
"Distribution %s not supported for row_sparse" % (distribution)
# sample index
idx_sample = rnd.rand(shape[0])
indices = np.argwhere(idx_sample < density).flatten()
Expand All @@ -112,10 +231,15 @@ def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
arr = mx.nd.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype)
return arr, (val, indices)
elif stype == 'csr':
assert(len(shape) == 2)
csr = sp.rand(shape[0], shape[1], density=density, format='csr', dtype=dtype)
result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices, shape, dtype=dtype)
return result, (csr.indptr, csr.indices, csr.data)
assert len(shape) == 2
if distribution == "uniform":
csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype)
return csr, (csr.indptr, csr.indices, csr.data)
elif distribution == "powerlaw":
csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype)
return csr, (csr.indptr, csr.indices, csr.data)
else:
assert(False), "Distribution not supported: %s" % (distribution)
else:
assert(False), "unknown storage type"

Expand Down
33 changes: 33 additions & 0 deletions tests/python/unittest/test_sparse_ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,39 @@ def test_sparse_nd_empty():
assert(nd.stype == stype)


def test_synthetic_dataset_generator():
def test_powerlaw_generator(csr_arr, final_row=1):
"""Test power law distribution
Total Elements: 32000, Number of zeros: 3200
Every row has 2 * non zero elements of the previous row.
Also since (2047 < 3200 < 4095) this will be true till 10th row"""
indices = csr_arr.indices.asnumpy()
indptr = csr_arr.indptr.asnumpy()
for row in range(1, final_row + 1):
nextrow = row + 1
current_row_nnz = indices[indptr[row] - 1] + 1
next_row_nnz = indices[indptr[nextrow] - 1] + 1
assert next_row_nnz == 2 * current_row_nnz

# Test if density is preserved
csr_arr_cols, _ = rand_sparse_ndarray(shape=(32, 10000), stype="csr",
density=0.01, distribution="powerlaw")

csr_arr_small, _ = rand_sparse_ndarray(shape=(5, 5), stype="csr",
density=0.5, distribution="powerlaw")

csr_arr_big, _ = rand_sparse_ndarray(shape=(32, 1000000), stype="csr",
density=0.4, distribution="powerlaw")

csr_arr_square, _ = rand_sparse_ndarray(shape=(1600, 1600), stype="csr",
density=0.5, distribution="powerlaw")
assert len(csr_arr_cols.data) == 3200
test_powerlaw_generator(csr_arr_cols, final_row=9)
test_powerlaw_generator(csr_arr_small, final_row=1)
test_powerlaw_generator(csr_arr_big, final_row=4)
test_powerlaw_generator(csr_arr_square, final_row=6)


if __name__ == '__main__':
import nose
nose.runmodule()

0 comments on commit 2d93d72

Please sign in to comment.