Add get_synthetic_dataset function to util (apache#146)

* Add get_synthetic_datasets
* Move to test_utils
* Remove _get_uniform_dataset
* Move validation to its own function
* Refactor the validation code for csr generation
* Make test_powerlaw a nested function
* Change SparseNDArray to CSRNDArray
* Merge with dtype specific changes in test_utils
cjolivier01 · Aug 10, 2017 · 2d93d72 · 2d93d72
1 parent 8040953
commit 2d93d72
Show file tree

Hide file tree

Showing 2 changed files with 163 additions and 6 deletions.
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
@@ -96,11 +96,130 @@ def random_sample(population, k):
     return population_copy[0:k]
 
 
-def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
-    """Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np) """
+def _validate_csr_generation_inputs(num_rows, num_cols, density,
+                                    distribution="uniform"):
+    """Validates inputs for csr generation helper functions
+    """
+    total_nnz = int(num_rows * num_cols * density)
+    if density < 0 or density > 1:
+        raise ValueError("density has to be between 0 and 1")
+
+    if num_rows <= 0 or num_cols <= 0:
+        raise ValueError("num_rows or num_cols should be greater than 0")
+
+    if distribution == "powerlaw":
+        if total_nnz < 2 * num_rows:
+            raise ValueError("not supported for this density: %s"
+                             " for this shape (%s, %s)"
+                             " Please keep :"
+                             " num_rows * num_cols * density >= 2 * num_rows"
+                             % (density, num_rows, num_cols))
+
+
+def _get_uniform_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
+    """Returns CSRNDArray with uniform distribution
+    This generates a csr matrix with totalnnz unique randomly chosen numbers
+    from num_rows*num_cols and arranges them in the 2d array in the
+    following way: row_index = (random_number_generated // num_cols)
+    col_index = random_number_generated - row_index * num_cols
+    """
+    _validate_csr_generation_inputs(num_rows, num_cols, density,
+                                    distribution="uniform")
+    csr = sp.rand(num_rows, num_cols, density, dtype=dtype, format="csr")
+    result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices,
+                              (num_rows, num_cols), dtype=dtype)
+    return result
+
+
+def _get_powerlaw_dataset_csr(num_rows, num_cols, density=0.1, dtype=None):
+    """Returns CSRNDArray with powerlaw distribution
+    with exponentially increasing number of non zeros in each row.
+    Not supported for cases where total_nnz < 2*num_rows. This is because
+    the algorithm first tries to ensure that no row is empty by
+    putting a non zero at the beginning of each row.
+    """
+
+    _validate_csr_generation_inputs(num_rows, num_cols, density,
+                                    distribution="powerlaw")
+
+    total_nnz = int(num_rows * num_cols * density)
+
+    unused_nnz = total_nnz
+    output_arr = np.zeros((num_rows, num_cols), dtype=dtype)
+    # Start with a non zero in the first column of each row so that no row is empty
+    for row in range(num_rows):
+        output_arr[row][0] = 1 + rnd.uniform(0.001, 2)
+        unused_nnz = unused_nnz - 1
+        if unused_nnz <= 0:
+            return mx.nd.array(output_arr).tostype("csr")
+
+    # Populate rest of matrix with 2^i items in ith row.
+    # if we have used all total nnz return the sparse matrix
+    # else if we reached max column size then fill up full rows until we use all nnz
+    col_max = 2
+    for row in range(num_rows):
+        col_limit = min(num_cols, col_max)
+        # In case col_limit reached assign same value to all elements, which is much faster
+        if col_limit == num_cols and unused_nnz > col_limit:
+            output_arr[row] = 1 + rnd.uniform(0.001, 2)
+            unused_nnz = unused_nnz - col_limit + 1
+            if unused_nnz <= 0:
+                return mx.nd.array(output_arr).tostype("csr")
+            else:
+                continue
+        for col_index in range(1, col_limit):
+            output_arr[row][col_index] = 1 + rnd.uniform(0.001, 2)
+            unused_nnz = unused_nnz - 1
+            if unused_nnz <= 0:
+                return mx.nd.array(output_arr).tostype("csr")
+        col_max = col_max * 2
+
+    if unused_nnz > 0:
+        #return mx.nd.array(sp.random(num_rows, num_cols, density).toarray()).tostype("csr")
+        raise ValueError("not supported for this density: %s"
+                         " for this shape (%s,%s)" % (density, num_rows, num_cols))
+    else:
+        return mx.nd.array(output_arr).tostype("csr")
+
+
+def rand_sparse_ndarray(shape, stype, density=None, distribution="uniform", dtype=None):
+    """Generate a random sparse ndarray. Returns the ndarray, value(np) and indices(np)
+    Parameters
+    ----------
+    shape: list or tuple
+    stype: str, valid values: "csr" or "row_sparse"
+    density, optional: float, should be between 0 and 1
+    distribution, optional: str, valid values: "uniform" or "powerlaw"
+    dtype, optional: numpy.dtype, default value is None
+    Returns
+    -------
+    Result of type CSRNDArray or RowSparseNDArray
+    Examples
+    --------
+    Below is an example of the powerlaw distribution with csr as the stype.
+    It calculates the nnz using the shape and density.
+    It fills up the ndarray with exponentially increasing number of elements.
+    If there are enough unused_nnzs, n+1th row will have twice as many nnzs as nth row.
+    else, remaining unused_nnzs will be used in n+1th row
+    If number of cols is too small and we have already reached column size it will fill up
+    all following columns in all following rows until we reach the required density.
+
+    >>> csr_arr, _ = rand_sparse_ndarray(shape=(5, 16), stype="csr",
+    ...                                  density=0.50, distribution="powerlaw")
+    >>> indptr = csr_arr.indptr.asnumpy()
+    >>> indices = csr_arr.indices.asnumpy()
+    >>> data = csr_arr.data.asnumpy()
+    >>> row2nnz = len(data[indptr[1]:indptr[2]])
+    >>> row3nnz = len(data[indptr[2]:indptr[3]])
+    >>> assert(row3nnz == 2*row2nnz)
+    >>> row4nnz = len(data[indptr[3]:indptr[4]])
+    >>> assert(row4nnz == 2*row3nnz)
+    """
     density = rnd.rand() if density is None else density
     dtype = default_dtype() if dtype is None else dtype
     if stype == 'row_sparse':
+        assert (distribution == "uniform"), \
+               "Distribution %s not supported for row_sparse" % (distribution)
         # sample index
         idx_sample = rnd.rand(shape[0])
         indices = np.argwhere(idx_sample < density).flatten()
@@ -112,10 +231,15 @@ def rand_sparse_ndarray(shape, stype, density=None, dtype=None):
         arr = mx.nd.row_sparse_array(val, indices, shape, indices_type=np.int64, dtype=dtype)
         return arr, (val, indices)
     elif stype == 'csr':
-        assert(len(shape) == 2)
-        csr = sp.rand(shape[0], shape[1], density=density, format='csr', dtype=dtype)
-        result = mx.nd.csr_matrix(csr.data, csr.indptr, csr.indices, shape, dtype=dtype)
-        return result, (csr.indptr, csr.indices, csr.data)
+        assert len(shape) == 2
+        if distribution == "uniform":
+            csr = _get_uniform_dataset_csr(shape[0], shape[1], density, dtype=dtype)
+            return csr, (csr.indptr, csr.indices, csr.data)
+        elif distribution == "powerlaw":
+            csr = _get_powerlaw_dataset_csr(shape[0], shape[1], density, dtype=dtype)
+            return csr, (csr.indptr, csr.indices, csr.data)
+        else:
+            assert(False), "Distribution not supported: %s" % (distribution)
     else:
         assert(False), "unknown storage type"
 

diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py
@@ -467,6 +467,39 @@ def test_sparse_nd_empty():
         assert(nd.stype == stype)
 
 
+def test_synthetic_dataset_generator():
+    def test_powerlaw_generator(csr_arr, final_row=1):
+        """Test power law distribution
+        Total Elements: 320000, Number of non zeros: 3200
+        Every row has 2 * non zero elements of the previous row.
+        Also since (2047 < 3200 < 4095) this will be true till 10th row"""
+        indices = csr_arr.indices.asnumpy()
+        indptr = csr_arr.indptr.asnumpy()
+        for row in range(1, final_row + 1):
+            nextrow = row + 1
+            current_row_nnz = indices[indptr[row] - 1] + 1
+            next_row_nnz = indices[indptr[nextrow] - 1] + 1
+            assert next_row_nnz == 2 * current_row_nnz
+
+    # Test if density is preserved
+    csr_arr_cols, _ = rand_sparse_ndarray(shape=(32, 10000), stype="csr",
+                                          density=0.01, distribution="powerlaw")
+
+    csr_arr_small, _ = rand_sparse_ndarray(shape=(5, 5), stype="csr",
+                                           density=0.5, distribution="powerlaw")
+
+    csr_arr_big, _ = rand_sparse_ndarray(shape=(32, 1000000), stype="csr",
+                                         density=0.4, distribution="powerlaw")
+
+    csr_arr_square, _ = rand_sparse_ndarray(shape=(1600, 1600), stype="csr",
+                                            density=0.5, distribution="powerlaw")
+    assert len(csr_arr_cols.data) == 3200
+    test_powerlaw_generator(csr_arr_cols, final_row=9)
+    test_powerlaw_generator(csr_arr_small, final_row=1)
+    test_powerlaw_generator(csr_arr_big, final_row=4)
+    test_powerlaw_generator(csr_arr_square, final_row=6)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()