add random sampling function

Finschia · Mar 30, 2020 · a428c18 · a428c18
1 parent 55b8f34
commit a428c18
Show file tree

Hide file tree

Showing 2 changed files with 215 additions and 0 deletions.
diff --git a/libs/rand/sampling.go b/libs/rand/sampling.go
@@ -0,0 +1,102 @@
+package rand
+
+import (
+	"fmt"
+	s "sort"
+)
+
+// Candidate is the interface for performing weighted deterministic random selection.
+type Candidate interface {
+	// Priority returns the weight of the candidate; the sampling code accumulates these values and sorts by them
+	// in descending order.
+	Priority() uint64
+	// LessThan reports whether this candidate orders before other. The sampling code uses it only to break ties
+	// between candidates that have the same priority.
+	LessThan(other *Candidate) bool
+}
+
+// uint64Mask keeps the low 63 bits of a uint64 (it clears only the most significant bit).
+const uint64Mask = uint64(0x7FFFFFFFFFFFFFFF)
+
+// RandomSamplingWithPriority selects a specified number of candidates at random from the candidate set, weighted by
+// the priority of each candidate. This function is deterministic and will produce the same result for the same input.
+//
+// Inputs:
+// seed - 64bit integer used for random selection.
+// candidates - A set of candidates. The order is disregarded and the given slice is not modified.
+// sampleSize - The number of candidates to select at random. Zero yields an empty result.
+// totalPriority - The exact sum of the priorities of each candidate.
+//
+// Returns:
+// samples - The randomly selected candidates. NOTE that the same candidate may have been selected in duplicate.
+//
+// Panics when not every sample can be drawn, which happens when the candidate set is empty or when totalPriority is
+// greater than the actual sum of the candidates' priorities.
+func RandomSamplingWithPriority(seed uint64, candidates []Candidate, sampleSize int, totalPriority uint64) (samples []Candidate) {
+	// nothing to draw; returning here also avoids indexing into an empty threshold list below
+	if sampleSize == 0 {
+		return make([]Candidate, 0)
+	}
+
+	// remember the seed as given, nextRandom advances it in place
+	initialSeed := seed
+
+	// generates a random selection threshold for candidates' cumulative priority
+	limit := float64(totalPriority)
+	thresholds := make([]uint64, sampleSize)
+	for i := range thresholds {
+		// calculating [gross weights] × [random number intended to be in [0,1)]; the 63-bit random value can round
+		// up to 2^63 when converted to float64, which makes the ratio exactly 1
+		scaled := float64(nextRandom(&seed)&uint64Mask) / float64(uint64Mask+1) * limit
+		threshold := totalPriority
+		if scaled < limit {
+			threshold = uint64(scaled)
+		}
+		// keep the threshold inside [0, totalPriority) so that floating-point rounding cannot push it past the
+		// last candidate and cause a spurious panic
+		if totalPriority > 0 && threshold >= totalPriority {
+			threshold = totalPriority - 1
+		}
+		thresholds[i] = threshold
+	}
+	s.Slice(thresholds, func(i, j int) bool { return thresholds[i] < thresholds[j] })
+
+	// works on a sorted copy so that the result does not depend on the order of the given slice
+	candidates = sort(candidates)
+
+	// extract candidates with a cumulative priority threshold
+	samples = make([]Candidate, sampleSize)
+	cumulativePriority := uint64(0)
+	undrawn := 0
+	for _, candidate := range candidates {
+		for thresholds[undrawn] < cumulativePriority+candidate.Priority() {
+			samples[undrawn] = candidate
+			undrawn++
+			if undrawn == len(samples) {
+				return samples
+			}
+		}
+		cumulativePriority += candidate.Priority()
+	}
+
+	// Possible factors: 1) the given total priority is greater than the actual cumulative one, 2) the given candidates
+	// is an empty set, or 3) a bug.
+	// Every candidate has been visited at this point, so the running sum is the actual total.
+	actualTotalPriority := cumulativePriority
+	msg := fmt.Sprintf("totalPriority=%d, actualTotalPriority=%d,"+
+		" seed=%d, sampleSize=%d, undrawn=%d, threshold[%d]=%d",
+		totalPriority, actualTotalPriority, initialSeed, sampleSize, undrawn, undrawn, thresholds[undrawn])
+	if len(candidates) == 0 {
+		msg = fmt.Sprintf("The given candidate is an empty set: %s", msg)
+	} else if totalPriority > actualTotalPriority {
+		msg = fmt.Sprintf("The given total priority %d is greater than the actual one %d, or a bug: %s",
+			totalPriority, actualTotalPriority, msg)
+	}
+	panic(msg)
+}
+
+// nextRandom advances the state pointed to by rand and returns the next SplitMix64 output.
+// http://xoshiro.di.unimi.it/splitmix64.c
+//
+// The PRNG used for this random selection:
+//   1. must be deterministic.
+//   2. should be easily portable, independent of language or library.
+//   3. does not need a long period like MT, since there aren't many random numbers to generate and
+//      we expect a certain amount of randomness in the seed.
+func nextRandom(rand *uint64) uint64 {
+	const (
+		increment   = uint64(0x9e3779b97f4a7c15)
+		multiplierA = uint64(0xbf58476d1ce4e5b9)
+		multiplierB = uint64(0x94d049bb133111eb)
+	)
+	// step the caller's state first; the output is derived from the new state
+	*rand += increment
+	x := *rand
+	x ^= x >> 30
+	x *= multiplierA
+	x ^= x >> 27
+	x *= multiplierB
+	x ^= x >> 31
+	return x
+}
+
+// sort returns a copy of candidates ordered by descending priority; candidates with the same priority are ordered
+// by their LessThan method (ascending natural order). The given slice is left untouched.
+func sort(candidates []Candidate) []Candidate {
+	ordered := make([]Candidate, len(candidates))
+	copy(ordered, candidates)
+	s.Slice(ordered, func(a, b int) bool {
+		left, right := ordered[a].Priority(), ordered[b].Priority()
+		if left == right {
+			// equal weights: fall back to the candidates' own ordering
+			return ordered[a].LessThan(&ordered[b])
+		}
+		return left > right
+	})
+	return ordered
+}
diff --git a/libs/rand/sampling_test.go b/libs/rand/sampling_test.go
@@ -0,0 +1,113 @@
+package rand
+
+import (
+	"fmt"
+	"math"
+	s "sort"
+	"testing"
+)
+
+// Element is the Candidate implementation used by the tests in this file.
+type Element struct {
+	// Id identifies the element; LessThan compares elements by it.
+	Id     uint32
+	// Weight is the value returned by Priority.
+	Weight uint64
+}
+
+// Priority returns the Weight of the element.
+func (e *Element) Priority() uint64 {
+	return e.Weight
+}
+
+// LessThan reports whether e has a smaller Id than other. It panics with "incompatible type" when other does not
+// hold an *Element.
+func (e *Element) LessThan(other *Candidate) bool {
+	if peer, isElement := (*other).(*Element); isElement {
+		return e.Id < peer.Id
+	}
+	panic("incompatible type")
+}
+
+// TestRandomSamplingWithPriority checks the size of the sample, that the same input yields the same result, and
+// that candidates with equal priority win with an even frequency.
+func TestRandomSamplingWithPriority(t *testing.T) {
+	candidates := newCandidates(100, func(i int) uint64 { return uint64(i) })
+	// the total priority must be the exact sum of the candidates' priorities, not the number of candidates
+	totalPriority := uint64(0)
+	for _, c := range candidates {
+		totalPriority += c.Priority()
+	}
+
+	elected := RandomSamplingWithPriority(0, candidates, 10, totalPriority)
+	if len(elected) != 10 {
+		// t.Error with a pre-built message: the message is data, not a format string
+		t.Error(fmt.Sprintf("unexpected sample size: %d", len(elected)))
+	}
+
+	// ----
+	// The same result can be obtained for the same input.
+	others := newCandidates(100, func(i int) uint64 { return uint64(i) })
+	secondTimeElected := RandomSamplingWithPriority(0, others, 10, totalPriority)
+	if len(elected) != len(secondTimeElected) || !sameCandidates(elected, secondTimeElected) {
+		t.Error(fmt.Sprintf("undeterministic: %+v != %+v", elected, secondTimeElected))
+	}
+
+	// ----
+	// Make sure the winning frequency will be even
+	candidates = newCandidates(100, func(i int) uint64 { return 1 })
+	counts := make([]int, len(candidates))
+	for i := 0; i < 100000; i++ {
+		// every priority is 1 here, so the number of candidates is the exact total priority
+		elected = RandomSamplingWithPriority(uint64(i), candidates, 10, uint64(len(candidates)))
+		for _, e := range elected {
+			counts[e.(*Element).Id]++
+		}
+	}
+	expected := float64(1) / float64(100)
+	mean, variance, z := calculateZ(expected, counts)
+	if z >= 1e-15 || math.Abs(mean-expected) >= 1e-15 || variance >= 1e-5 {
+		t.Errorf("winning frequency is uneven: mean=%f, variance=%e, z=%e", mean, variance, z)
+	}
+}
+
+
+// newCandidates creates length candidates; the i-th one is an *Element with Id i and weight prio(i).
+func newCandidates(length int, prio func(int) uint64) (candidates []Candidate) {
+	// size the slice from the argument so that any length works and no nil entries are left behind
+	candidates = make([]Candidate, length)
+	for i := range candidates {
+		candidates[i] = &Element{Id: uint32(i), Weight: prio(i)}
+	}
+	return candidates
+}
+
+// sameCandidates reports whether c1 and c2 hold elements with the same Ids, regardless of order. Every entry must
+// hold an *Element. The comparison works on sorted copies, so the given slices keep their order.
+func sameCandidates(c1 []Candidate, c2 []Candidate) bool {
+	if len(c1) != len(c2) {
+		return false
+	}
+	// sorted returns a copy ordered by LessThan, leaving the argument untouched
+	sorted := func(in []Candidate) []Candidate {
+		out := make([]Candidate, len(in))
+		copy(out, in)
+		s.Slice(out, func(i, j int) bool { return out[i].LessThan(&out[j]) })
+		return out
+	}
+	left, right := sorted(c1), sorted(c2)
+	for i := range left {
+		if left[i].(*Element).Id != right[i].(*Element).Id {
+			return false
+		}
+	}
+	return true
+}
+
+// calculateZ converts the given counts into shares of their total, computes the mean and the variance of those
+// shares, and returns them together with z = (mean - expected) / sqrt(variance / len(values)).
+// The z value is meant to be compared against a standard normal distribution table.
+func calculateZ(expected float64, values []int) (mean, variance, z float64) {
+	total := 0.0
+	for _, v := range values {
+		total += float64(v)
+	}
+	shares := make([]float64, 0, len(values))
+	for _, v := range values {
+		shares = append(shares, float64(v)/total)
+	}
+	mean, variance = calculateMeanAndVariance(shares)
+	standardError := math.Sqrt(variance / float64(len(values)))
+	z = (mean - expected) / standardError
+	return mean, variance, z
+}
+
+// calculateMeanAndVariance returns the arithmetic mean of values and their population variance (the squared
+// deviations are divided by len(values)).
+func calculateMeanAndVariance(values []float64) (mean float64, variance float64) {
+	n := float64(len(values))
+	total := 0.0
+	for i := range values {
+		total += values[i]
+	}
+	mean = total / n
+	squares := 0.0
+	for i := range values {
+		deviation := values[i] - mean
+		squares += deviation * deviation
+	}
+	variance = squares / n
+	return mean, variance
+}