Skip to content

Commit

Permalink
Merge pull request #18 from kellieotto/update-details
Browse files Browse the repository at this point in the history
Update details
  • Loading branch information
kellieotto authored Sep 13, 2018
2 parents 604545b + 7d826e8 commit 58b6af4
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 95 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ The prototype generator is built on SHA-256.
## Installation from binaries

```
$ pip install git+git://github.com/statlab/cryptorandom.git
$ pip install cryptorandom
```

## Installation from source
Expand Down
117 changes: 32 additions & 85 deletions cryptorandom/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,19 @@ def random_sample(a, size, replace=False, p=None, method="sample_by_index", prng
If weights are provided, len(weights) must equal N.
Sampling methods available are:
Fisher-Yates: sampling without weights, without replacement
PIKK: sampling without weights, without replacement
recursive: samping without weights, without replacement
Waterman_R: sampling without weights, without replacement
Vitter_Z: sampling without weights, without replacement
sample_by_index: sampling without weights, without replacement
Exponential: sampling with weights, without replacement
Elimination: sampling with weights, without replacement
...
* Fisher-Yates: sampling without weights, without replacement
* PIKK: sampling without weights, without replacement
* recursive: sampling without weights, without replacement
* Waterman_R: sampling without weights, without replacement
* Vitter_Z: sampling without weights, without replacement
* sample_by_index: sampling without weights, without replacement
* Exponential: sampling with weights, without replacement
* Elimination: sampling with weights, without replacement
Fisher-Yates, PIKK, sample_by_index, Exponential, and Elimination return randomly ordered
samples, i.e. they are as likely to return [1, 2] as they are to return [2, 1]. Waterman_R,
Vitter_Z, and recursive aren't guaranteed to randomize the order of items in the sample.
Parameters
----------
a : 1-D array-like or int
Expand Down Expand Up @@ -126,9 +128,9 @@ def random_permutation(a, method="Fisher-Yates", prng=None):
Construct a random permutation (re-ordering) of a population `a`.
The algorithms available are:
Fisher-Yates: a shuffling algorithm
random_sort: generate random floats and sort
permute_by_index: sample integer indices without replacement
* Fisher-Yates: a shuffling algorithm
* random_sort: generate random floats and sort
* permute_by_index: sample integer indices without replacement
Parameters
----------
Expand Down Expand Up @@ -158,9 +160,9 @@ def random_permutation(a, method="Fisher-Yates", prng=None):
raise ValueError("a must be an integer or array-like")

methods = {
"Fisher-Yates" : lambda N: fykd_shuffle(N, prng=prng),
"random_sort" : lambda N: pikk_shuffle(N, prng=prng),
"permute_by_index" : lambda N: permute_by_index(N, prng=prng),
"Fisher-Yates" : lambda N: fykd_sample(N, N, prng=prng),
"random_sort" : lambda N: pikk(N, N, prng=prng),
"permute_by_index" : lambda N: sample_by_index(N, N, prng=prng),
}

try:
Expand Down Expand Up @@ -263,7 +265,7 @@ def recursive_sample(n, k, prng=None):

def waterman_r(n, k, prng=None):
'''
Waterman's Algorithm R for resevoir SRSs
Waterman's Algorithm R for reservoir SRSs
Draw a sample of k out of 1, ..., n without replacement
Parameters
Expand Down Expand Up @@ -291,7 +293,7 @@ def waterman_r(n, k, prng=None):

def vitter_z(n, k, prng=None):
'''
Vitter's Algorithm Z for resevoir SRSs (Vitter 1985).
Vitter's Algorithm Z for reservoir SRSs (Vitter 1985).
Draw a sample of k out of 1, ..., n without replacement
Parameters
Expand All @@ -313,10 +315,15 @@ def vitter_z(n, k, prng=None):
def Algorithm_X(n, t):
V = prng.random()
s = 0
frac = 2
numer = math.factorial(t+s+1-n)/math.factorial(t-n)
denom = math.factorial(t+s+1)/math.factorial(t)
frac = numer/denom

while frac > V:
s += 1
frac = ((t+1-n)/(t+1))**(s+1)
numer = (t+s+1-n)*numer
denom = (t+s+1)*denom
frac = numer/denom
return s

def f(x, t):
Expand All @@ -338,7 +345,7 @@ def c(t):
sam = np.array(range(1, k+1)) # fill the reservoir
t = k

while t <= n:
while t < n:
# Determine how many unseen records, nu, to skip
if t <= 22*k: # the choice of 22 is taken from Vitter's 1985 ACM paper
nu = Algorithm_X(k, t)
Expand All @@ -353,10 +360,10 @@ def c(t):
break
var = f(np.floor(X), t)/(c(t)*g(X, t))
nu = np.floor(X)
if t+nu <= n:
if t+nu < n:
# Make the next record a candidate, replacing one at random
i = prng.randint(0, k)
sam[i] = int(t+nu)
sam[i] = int(t+nu+1)
t = t+nu+1
return sam

Expand Down Expand Up @@ -496,67 +503,7 @@ def exponential_sample(k, p, prng=None):
elif k == n:
return np.array(range(k))
else:
sam = prng.random(size=n)
sam = np.array(prng.random(size=n), dtype=float)
sam = -np.log(sam)/weights
sample = sam.argsort()[0:k]
return sample+1

######################## Permutation functions #################################

def fykd_shuffle(n, prng=None):
    '''
    Permute 1, ..., n with the Fisher-Yates-Knuth-Durstenfeld algorithm.

    Thin wrapper around `fykd_sample`: a permutation is obtained by drawing
    a sample of size n from a population of size n.

    Parameters
    ----------
    n : int
        Population size
    prng : {None, int, object}
        Forwarded unchanged to `fykd_sample`.
        Presumably None requests a randomly seeded SHA256 instance, an int
        seeds a new SHA256 instance, and an existing PRNG instance is used
        as is -- confirm against `fykd_sample`.

    Returns
    -------
    The result of `fykd_sample(n, n, prng=prng)`, intended to be a
    permutation of {1, ..., n}
    '''
    population_size = n
    sample_size = n
    shuffled = fykd_sample(population_size, sample_size, prng=prng)
    return shuffled


def pikk_shuffle(n, prng=None):
    '''
    Permute 1, ..., n by sorting on random keys.

    One random value is drawn per item; the items are then ordered by
    those values.

    Parameters
    ----------
    n : int
        Population size
    prng : {None, int, object}
        Passed to `get_prng` to obtain the generator.
        Presumably None yields a randomly seeded SHA256 instance, an int
        seeds a new SHA256 instance, and an existing PRNG instance is
        returned as is -- confirm against `get_prng`.

    Returns
    -------
    The argsort of the n random keys, shifted by one so that the values
    run over 1, ..., n rather than 0, ..., n-1
    '''
    generator = get_prng(prng)
    sort_keys = generator.random(n)
    ordering = np.argsort(sort_keys)
    # argsort is 0-based; the permutation is reported on 1, ..., n
    return ordering + 1


def permute_by_index(n, prng=None):
    '''
    Permute 1, ..., n by sampling indices without replacement.

    Thin wrapper around `sample_by_index`: a permutation is obtained by
    drawing a sample of size n from a population of size n.

    Parameters
    ----------
    n : int
        Population size
    prng : {None, int, object}
        Forwarded unchanged to `sample_by_index`.
        Presumably None requests a randomly seeded SHA256 instance, an int
        seeds a new SHA256 instance, and an existing PRNG instance is used
        as is -- confirm against `sample_by_index`.

    Returns
    -------
    The result of `sample_by_index(n, n, prng=prng)`, intended to be a
    permutation of {1, ..., n}
    '''
    population_size = n
    sample_size = n
    permuted = sample_by_index(population_size, sample_size, prng=prng)
    return permuted
14 changes: 7 additions & 7 deletions cryptorandom/tests/test_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,19 +207,19 @@ def test_vitter_z():
"""
ff = fake_generator()
sam = vitter_z(5, 2, prng=ff)
assert (sam == [4, 2]).all()
assert (sam == [5, 2]).all()

ff = fake_generator()
sam = random_sample(5, 2, method="Vitter_Z", prng=ff)
assert (sam+1 == [4, 2]).all() # shift to 1-index
assert (sam+1 == [5, 2]).all() # shift to 1-index

ff = fake_generator()
sam = vitter_z(500, 2, prng=ff)
assert (sam == [420, 265]).all()
assert (sam == [472, 422]).all()

ff = fake_generator()
sam = random_sample(500, 2, method="Vitter_Z", prng=ff)
assert (sam+1 == [420, 265]).all() # shift to 1-index
assert (sam+1 == [472, 422]).all() # shift to 1-index


def test_elimination_sample():
Expand Down Expand Up @@ -261,7 +261,7 @@ def test_fykd_shuffle():
Test Fisher-Yates shuffle for random permutations, fykd_shuffle
"""
ff = fake_generator()
sam = fykd_shuffle(5, prng=ff)
sam = fykd_sample(5, 5, prng=ff)
assert (sam == [1, 2, 3, 4, 5]).all()

ff = fake_generator()
Expand All @@ -279,7 +279,7 @@ def test_pikk_shuffle():
Test PIKK shuffling
"""
ff = fake_generator()
sam = pikk_shuffle(5, prng=ff)
sam = pikk(5, 5, prng=ff)
assert (sam == [1, 2, 3, 4, 5]).all()

ff = fake_generator()
Expand All @@ -292,7 +292,7 @@ def test_permute_by_index():
Test permuting by index shuffling
"""
ff = fake_generator()
sam = permute_by_index(5, prng=ff)
sam = sample_by_index(5, 5, prng=ff)
assert (sam == [2, 3, 1, 4, 5]).all()

ff = fake_generator()
Expand Down
64 changes: 64 additions & 0 deletions docs/examples/sample.rst
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,68 @@ Elimination yes without replacement
10000 loops, best of 3: 22 µs per loop
>>> %timeit random_sample(fruit, 2, method="sample_by_index", prng=s)
100000 loops, best of 3: 15 µs per loop
Some sampling methods (Fisher-Yates, PIKK, sample_by_index, Exponential, and Elimination) return randomly ordered samples, i.e. they are as likely to return [1, 2] as they are to return [2, 1].

.. code::
>>> s = SHA256(1234567890)
>>> counts = {}
>>> for i in range(10000):
>>> sam = pikk(5, 2, prng=s)
>>> if str(sam) in counts.keys():
>>> counts[str(sam)]+=1
>>> else:
>>> counts[str(sam)]=0
>>> counts
{'[1 2]': 549,
'[1 3]': 528,
'[1 4]': 512,
'[1 5]': 502,
'[2 1]': 515,
'[2 3]': 485,
'[2 4]': 487,
'[2 5]': 482,
'[3 1]': 484,
'[3 2]': 482,
'[3 4]': 466,
'[3 5]': 525,
'[4 1]': 468,
'[4 2]': 512,
'[4 3]': 490,
'[4 5]': 490,
'[5 1]': 547,
'[5 2]': 460,
'[5 3]': 507,
'[5 4]': 489}
The reservoir algorithms (Waterman_R and Vitter_Z) and the recursive method aren't guaranteed to randomize the order of sampled items.

.. code::
>>> s = SHA256(1234567890)
>>> counts = {}
>>> for i in range(10000):
>>> sam = recursive_sample(5, 2, prng=s)
>>> if str(sam) in counts.keys():
>>> counts[str(sam)]+=1
>>> else:
>>> counts[str(sam)]=0
>>> counts
{'[1 2]': 492,
'[1 3]': 499,
'[1 4]': 503,
'[1 5]': 1016,
'[2 1]': 462,
'[2 3]': 487,
'[2 4]': 525,
'[2 5]': 985,
'[3 1]': 481,
'[3 2]': 485,
'[3 4]': 507,
'[3 5]': 984,
'[4 1]': 524,
'[4 2]': 475,
'[4 3]': 516,
'[4 5]': 1043}
6 changes: 5 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ Welcome to cryptorandom's documentation!
`cryptorandom` is a package for random sampling and random number generation using
cryptographically secure pseudorandom number generators.

`Download the package here!`__
`Download the package on GitHub`__ or install it from PyPI:

.. code::
pip install cryptorandom
.. __: https://github.com/statlab/cryptorandom

Expand Down
28 changes: 28 additions & 0 deletions docs/release/release_0.2.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Announcement: cryptorandom 0.2
==============================

We're happy to announce the release of cryptorandom v0.2!

cryptorandom is a cryptographically secure PRNG and sampling module for Python.

For more information, examples, and documentation, please visit our website:

http://statlab.github.io/cryptorandom/


New Features
------------
* Functionality to generate random permutations of a list



Improvements
------------
* Bug fix in Vitter_Z
* Standardized output of sampling functions. Now, all of them return np.arrays
* Improved examples and documentation


Contributors to this release
----------------------------
Kellie Ottoboni
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
URL = 'http://www.github.com/statlab/cryptorandom'
LICENSE = 'BSD License'
DOWNLOAD_URL = 'http://www.github.com/statlab/cryptorandom'
VERSION = '0.1'
VERSION = '0.2'
PYTHON_VERSION = (2, 7)

INSTALL_REQUIRES = [
Expand Down

0 comments on commit 58b6af4

Please sign in to comment.