diff --git a/examples/basic_example.py b/examples/basic_example.py
index b060bd8..9bc4bfe 100755
--- a/examples/basic_example.py
+++ b/examples/basic_example.py
@@ -1,4 +1,9 @@
 #!/usr/bin/env python
+'''
+Basic example which creates a dummy test dataset and a
+benchmark collection. This example 'trains' on the benchmark
+set and displays the results of the comparison.
+'''
 
 import numpy
 
@@ -9,8 +14,8 @@
     'Uniform': numpy.random.uniform(size=100)
 }
 
-n_benchmark_collections = 5
-benchmark_labels = ['Benchmark_%d' % i for i in range(n_benchmark_collections)]
+N_BENCHMARK_COLLECTIONS = 5
+benchmark_labels = ['Benchmark_%d' % i for i in range(N_BENCHMARK_COLLECTIONS)]
 benchmark_data = {
     benchmark_label:{'Gaussian': numpy.random.normal(size=100),
                      'Uniform': numpy.random.uniform(size=100)}
@@ -18,7 +23,8 @@
 }
 
 # histogram the data
-test_histograms = {name:numpy.histogram(data)[0] for name, data in test_data.items()}
+test_histograms = {name:numpy.histogram(data)[0]
+                   for name, data in test_data.items()}
 benchmark_histograms = dict()
 for name, bm_data in benchmark_data.items():
     benchmark_histograms[name] = {n:numpy.histogram(data)[0]
diff --git a/examples/classic_fit_example.py b/examples/classic_fit_example.py
index d8b20e1..14ba7db 100755
--- a/examples/classic_fit_example.py
+++ b/examples/classic_fit_example.py
@@ -1,5 +1,9 @@
 #!/usr/bin/env python
+'''
+Example illustrating the classic method: fitting by hand with scipy.optimize.curve_fit.
+'''
+
 
 import numpy
 import pylab
 import scipy.optimize
@@ -16,8 +20,11 @@
         data.append(time)
 
 def gauss(x, *p):
-    A, mu, sigma = p
-    return A*numpy.exp(-(x-mu)**2/(2.*sigma**2))
+    '''
+    The gaussian distribution.
+    '''
+    amplitude, mean, sigma = p
+    return amplitude*numpy.exp(-(x-mean)**2/(2.*sigma**2))
 
 hist, bin_edges = numpy.histogram(data)
 bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
@@ -25,16 +32,17 @@ def gauss(x, *p):
 p1 = [1000., 0., 1.]
 print(hist)
 print(bin_centers)
 coeff, var_matrix = scipy.optimize.curve_fit(gauss, bin_centers, hist, p0=p1)
-print('Fitted amplitude = %f' % coeff[0])
-print('Fitted mean = %f' % coeff[1])
-print('Fitted standard deviation = %f' % coeff[2])
+fitted_amplitude, fitted_mean, fitted_std_dev = coeff
+
+print('Fitted amplitude = %f' % fitted_amplitude)
+print('Fitted mean = %f' % fitted_mean)
+print('Fitted standard deviation = %f' % fitted_std_dev)
 
 hist_fit = gauss(bin_centers, *coeff)
 
 pylab.hist(data)
-pylab.plot(bin_centers, gauss(bin_centers, *p0), label='Initial')
+pylab.plot(bin_centers, gauss(bin_centers, *p1), label='Initial')
 pylab.plot(bin_centers, hist_fit, label='Fitted data')
 
 pylab.show()
-
diff --git a/examples/gaussian_example.py b/examples/gaussian_example.py
deleted file mode 100755
index 4d81c4f..0000000
--- a/examples/gaussian_example.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python
-
-import collections
-
-import numpy
-import pylab
-import scipy.optimize
-import scipy.stats
-
-import voka.metrics.chisq
-
-# This example illustrates the failure of traditional statistical
-# comparisons, such as chi^2, for weighted histograms from
-# stochastic processes. Can't rely on the p-value calculated
-# from standard tools such as scipy even in this simple example.
-
-# Histogram the arrival time, expected to be gaussian, of the charge
-# sampled from a gaussian.
-
-histograms = dict()
-n_histograms = 1000
-bins=50
-size=1000
-for i in range(n_histograms):
-    data = numpy.random.uniform(low=-5, high=5, size=size)
-    histograms['Uniform%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.chisquare(df=5, size=size)
-    histograms['ChiSq5%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.normal(size=size)
-    histograms['Gaussian%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.normal(size=int(size/2), loc=-1, scale = 0.25) + numpy.random.normal(size=int(size/2), loc=1, scale = 0.25)
-    histograms['BiGaussian%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.exponential(scale=10, size=1000)
-    histograms['Exponential%d' % i] = numpy.histogram(data, bins=bins)
-
-T_dist = collections.defaultdict(list)
-test_stat = voka.metrics.chisq.NormChiSq()
-for i in range(n_histograms):
-    for j in range(i+1, n_histograms):
-        h1 = histograms["Uniform%d" % i][0]
-        h2 = histograms["Uniform%d" % j][0]
-        T_dist['uniform'].append(test_stat(h1, h2))
-
-        h1 = histograms["ChiSq5%d" % i][0]
-        h2 = histograms["ChiSq5%d" % j][0]
-        T_dist['chisq'].append(test_stat(h1, h2))
-
-        h1 = histograms["Exponential%d" % i][0]
-        h2 = histograms["Exponential%d" % j][0]
-        T_dist['exponential'].append(test_stat(h1, h2))
-
-        h1 = histograms["Gaussian%d" % i][0]
-        h2 = histograms["Gaussian%d" % j][0]
-        T_dist['gaussian'].append(test_stat(h1, h2))
-
-        h1 = histograms["BiGaussian%d" % i][0]
-        h2 = histograms["BiGaussian%d" % j][0]
-        T_dist['bigaussian'].append(test_stat(h1, h2))
-
-pylab.figure(1)
-pylab.subplot(231)
-pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Uniform')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['uniform'], density=True, range=(0,200), bins=100)
-x = range(0, int(bin_edges[-1]))
-for ndof in range(10, 60, 10):
-    rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
-pylab.legend()
-
-pylab.subplot(232)
-pylab.title(r'$\chi^{2}$ Test Statistic Distribution - ChiSq')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['chisq'], density=True, range=(0,200), bins=100)
-x = range(0, int(bin_edges[-1]))
-for ndof in range(10, 60, 10):
-    rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
-pylab.legend()
-
-pylab.subplot(233)
-pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Gaussian')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['gaussian'], density=True, range=(0,200), bins=100)
-x = range(0, int(bin_edges[-1]))
-for ndof in range(10, 60, 10):
-    rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
-pylab.legend()
-
-pylab.subplot(234)
-pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Exponential')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['exponential'], density=True, range=(0,200), bins=100)
-x = range(0, int(bin_edges[-1]))
-for ndof in range(10, 60, 10):
-    rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
-pylab.legend()
-
-pylab.subplot(235)
-pylab.title(r'$\chi^{2}$ Test Statistic Distribution - BiGaussian')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['bigaussian'], density=True, range=(0,200), bins=100)
-x = range(0, int(bin_edges[-1]))
-for ndof in range(10, 60, 10):
-    rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
-pylab.legend()
-
-pylab.subplot(236)
-pylab.title(r'$\chi^{2}$ Test Statistic Distributions')
-pylab.xlabel(r'$\chi^{2}$')
-pylab.ylabel('Probability Density')
-pylab.hist(T_dist['uniform'], histtype='step', label='Uniform', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['bigaussian'], histtype='step',label='BiGaussian', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['gaussian'], histtype='step',label='Gaussian', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['exponential'], histtype='step',label='Exponential', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['chisq'], histtype='step',label='ChisSq', density=True, range=(0,200), bins=100)
-pylab.legend()
-
-pylab.show()
-
diff --git a/examples/standard_distribution_comparisons.py b/examples/standard_distribution_comparisons.py
index 4d81c4f..e2d7ca6 100644
--- a/examples/standard_distribution_comparisons.py
+++ b/examples/standard_distribution_comparisons.py
@@ -1,5 +1,15 @@
 #!/usr/bin/env python
+'''
+This example illustrates the failure of traditional statistical
+comparisons, such as chi^2, for weighted histograms from
+stochastic processes. Can't rely on the p-value calculated
+from standard tools such as scipy even in this simple example.
+
+Histogram the arrival time, expected to be gaussian, of the charge
+sampled from a gaussian.
+'''
+
 
 import collections
 
 import numpy
@@ -9,124 +19,162 @@
 
 import voka.metrics.chisq
 
-# This example illustrates the failure of traditional statistical
-# comparisons, such as chi^2, for weighted histograms from
-# stochastic processes. Can't rely on the p-value calculated
-# from standard tools such as scipy even in this simple example.
+histograms = dict()
+N_HISTOGRAMS = 1000
+BINS = 50
+SIZE = 1000
+for i in range(N_HISTOGRAMS):
+    data = numpy.random.uniform(low=-5, high=5, size=SIZE)
+    histograms['Uniform%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.chisquare(df=5, size=SIZE)
+    histograms['ChiSq5%d' % i] = numpy.histogram(data, bins=BINS)
 
-# Histogram the arrival time, expected to be gaussian, of the charge
-# sampled from a gaussian.
+    data = numpy.random.normal(size=SIZE)
+    histograms['Gaussian%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.normal(size=int(SIZE/2), loc=-1, scale=0.25) + \
+           numpy.random.normal(size=int(SIZE/2), loc=1, scale=0.25)
+    histograms['BiGaussian%d' % i] = numpy.histogram(data, bins=BINS)
 
-histograms = dict()
-n_histograms = 1000
-bins=50
-size=1000
-for i in range(n_histograms):
-    data = numpy.random.uniform(low=-5, high=5, size=size)
-    histograms['Uniform%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.chisquare(df=5, size=size)
-    histograms['ChiSq5%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.normal(size=size)
-    histograms['Gaussian%d' % i] = numpy.histogram(data, bins=bins)
-
-    data = numpy.random.normal(size=int(size/2), loc=-1, scale = 0.25) + numpy.random.normal(size=int(size/2), loc=1, scale = 0.25)
-    histograms['BiGaussian%d' % i] = numpy.histogram(data, bins=bins)
-
     data = numpy.random.exponential(scale=10, size=1000)
-    histograms['Exponential%d' % i] = numpy.histogram(data, bins=bins)
+    histograms['Exponential%d' % i] = numpy.histogram(data, bins=BINS)
 
 T_dist = collections.defaultdict(list)
 test_stat = voka.metrics.chisq.NormChiSq()
-for i in range(n_histograms):
-    for j in range(i+1, n_histograms):
+for i in range(N_HISTOGRAMS):
+    for j in range(i+1, N_HISTOGRAMS):
         h1 = histograms["Uniform%d" % i][0]
-        h2 = histograms["Uniform%d" % j][0]
+        h2 = histograms["Uniform%d" % j][0]
         T_dist['uniform'].append(test_stat(h1, h2))
 
         h1 = histograms["ChiSq5%d" % i][0]
-        h2 = histograms["ChiSq5%d" % j][0]
+        h2 = histograms["ChiSq5%d" % j][0]
         T_dist['chisq'].append(test_stat(h1, h2))
-
+
         h1 = histograms["Exponential%d" % i][0]
-        h2 = histograms["Exponential%d" % j][0]
+        h2 = histograms["Exponential%d" % j][0]
         T_dist['exponential'].append(test_stat(h1, h2))
-
+
         h1 = histograms["Gaussian%d" % i][0]
-        h2 = histograms["Gaussian%d" % j][0]
+        h2 = histograms["Gaussian%d" % j][0]
         T_dist['gaussian'].append(test_stat(h1, h2))
 
         h1 = histograms["BiGaussian%d" % i][0]
-        h2 = histograms["BiGaussian%d" % j][0]
+        h2 = histograms["BiGaussian%d" % j][0]
         T_dist['bigaussian'].append(test_stat(h1, h2))
-
+
 pylab.figure(1)
 pylab.subplot(231)
 pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Uniform')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['uniform'], density=True, range=(0,200), bins=100)
+bin_values, bin_edges, patches = pylab.hist(T_dist['uniform'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
 x = range(0, int(bin_edges[-1]))
 for ndof in range(10, 60, 10):
     rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
 pylab.legend()
 
 pylab.subplot(232)
 pylab.title(r'$\chi^{2}$ Test Statistic Distribution - ChiSq')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['chisq'], density=True, range=(0,200), bins=100)
+bin_values, bin_edges, patches = pylab.hist(T_dist['chisq'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
 x = range(0, int(bin_edges[-1]))
 for ndof in range(10, 60, 10):
     rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
 pylab.legend()
 
 pylab.subplot(233)
 pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Gaussian')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['gaussian'], density=True, range=(0,200), bins=100)
+bin_values, bin_edges, patches = pylab.hist(T_dist['gaussian'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
 x = range(0, int(bin_edges[-1]))
 for ndof in range(10, 60, 10):
     rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
 pylab.legend()
 
 pylab.subplot(234)
 pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Exponential')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['exponential'], density=True, range=(0,200), bins=100)
+bin_values, bin_edges, patches = pylab.hist(T_dist['exponential'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
 x = range(0, int(bin_edges[-1]))
 for ndof in range(10, 60, 10):
     rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
 pylab.legend()
 
 pylab.subplot(235)
 pylab.title(r'$\chi^{2}$ Test Statistic Distribution - BiGaussian')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-bin_values, bin_edges, patches = pylab.hist(T_dist['bigaussian'], density=True, range=(0,200), bins=100)
+bin_values, bin_edges, patches = pylab.hist(T_dist['bigaussian'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
 x = range(0, int(bin_edges[-1]))
 for ndof in range(10, 60, 10):
     rv = scipy.stats.chi2(ndof)
-    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
 pylab.legend()
 
 pylab.subplot(236)
 pylab.title(r'$\chi^{2}$ Test Statistic Distributions')
 pylab.xlabel(r'$\chi^{2}$')
 pylab.ylabel('Probability Density')
-pylab.hist(T_dist['uniform'], histtype='step', label='Uniform', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['bigaussian'], histtype='step',label='BiGaussian', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['gaussian'], histtype='step',label='Gaussian', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['exponential'], histtype='step',label='Exponential', density=True, range=(0,200), bins=100)
-pylab.hist(T_dist['chisq'], histtype='step',label='ChisSq', density=True, range=(0,200), bins=100)
+
+pylab.hist(T_dist['uniform'],
+           histtype='step',
+           label='Uniform',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['bigaussian'],
+           histtype='step',
+           label='BiGaussian',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['gaussian'],
+           histtype='step',
+           label='Gaussian',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['exponential'],
+           histtype='step',
+           label='Exponential',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['chisq'],
+           histtype='step',
+           label='ChiSq',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
 pylab.legend()
 
 pylab.show()
-
diff --git a/examples/stochastic_example.py b/examples/stochastic_example.py
index bc69ed1..2429ec7 100755
--- a/examples/stochastic_example.py
+++ b/examples/stochastic_example.py
@@ -1,5 +1,14 @@
 #!/usr/bin/env python
+'''
+This example illustrates the failure of traditional statistical
+comparisons, such as chi^2, for weighted histograms from
+stochastic processes.
+
+Histogram the arrival time, expected to be gaussian, of the charge
+sampled from a gaussian.
+'''
+
 import numpy
 import pylab
 import scipy.optimize
@@ -7,17 +16,9 @@
 
 import voka.metrics.chisq
 
-# This example illustrates the failure of traditional statistical
-# comparisons, such as chi^2, for weighted histograms from
-# stochastic processes.
-
-# Histogram the arrival time, expected to be gaussian, of the charge
-# sampled from a gaussian.
-
 histograms = dict()
-n_histograms = 100
-
-for i in range(n_histograms):
+N_HISTOGRAMS = 100
+for i in range(N_HISTOGRAMS):
     charge_data = list()
     time_data = numpy.random.normal(size=1000)
     for time in time_data:
@@ -28,33 +29,31 @@
         charge_data.append(time)
 
     histograms['ChargeHistogram%d' % i] = numpy.histogram(charge_data)
    histograms['TimeHistogram%d' % i] = numpy.histogram(time_data)
-
 charge_T_dist = list()
 time_T_dist = list()
 test_stat = voka.metrics.chisq.NormChiSq()
-ndof = None
-bin_centers = None
-for i in range(n_histograms):
-    for j in range(i+1, n_histograms):
+
+NDOF = None
+for i in range(N_HISTOGRAMS):
+    for j in range(i+1, N_HISTOGRAMS):
         ch1 = histograms["ChargeHistogram%d" % i][0]
         ch2 = histograms["ChargeHistogram%d" % j][0]
         th1 = histograms["TimeHistogram%d" % i][0]
         th2 = histograms["TimeHistogram%d" % j][0]
-        if not ndof:
-            ndof = len(ch1)
-
+        if not NDOF:
+            NDOF = len(ch1)
+
         charge_T_dist.append(test_stat(ch1, ch2))
         time_T_dist.append(test_stat(th1, th2))
-
-rv = scipy.stats.chi2(ndof)
+
+rv = scipy.stats.chi2(NDOF)
 
 pylab.figure(1)
 pylab.hist(charge_T_dist, density=True, bins=100)
 
 pylab.figure(2)
-pylab.hist(time_T_dist, density=True, range=(0,100), bins=100)
+pylab.hist(time_T_dist, density=True, range=(0, 100), bins=100)
 x = range(100)
 pylab.plot(x, rv.pdf(x), 'k-')
 pylab.show()
-
diff --git a/examples/traditional_example.py b/examples/traditional_example.py
new file mode 100755
index 0000000..e467154
--- /dev/null
+++ b/examples/traditional_example.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+
+'''
+This example illustrates the failure of traditional statistical
+comparisons, such as chi^2, for weighted histograms from
+stochastic processes. Can't rely on the p-value calculated
+from standard tools such as scipy even in this simple example.
+
+Histogram the arrival time, expected to be gaussian, of the charge
+sampled from a gaussian.
+'''
+
+import collections
+
+import numpy
+import pylab
+import scipy.optimize
+import scipy.stats
+
+import voka.metrics.chisq
+
+histograms = dict()
+N_HISTOGRAMS = 1000
+BINS = 50
+SIZE = 1000
+for i in range(N_HISTOGRAMS):
+    data = numpy.random.uniform(low=-5, high=5, size=SIZE)
+    histograms['Uniform%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.chisquare(df=5, size=SIZE)
+    histograms['ChiSq5%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.normal(size=SIZE)
+    histograms['Gaussian%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.normal(size=int(SIZE/2),
+                               loc=-1,
+                               scale=0.25) +\
+           numpy.random.normal(size=int(SIZE/2),
+                               loc=1,
+                               scale=0.25)
+    histograms['BiGaussian%d' % i] = numpy.histogram(data, bins=BINS)
+
+    data = numpy.random.exponential(scale=10, size=1000)
+    histograms['Exponential%d' % i] = numpy.histogram(data, bins=BINS)
+
+T_dist = collections.defaultdict(list)
+test_stat = voka.metrics.chisq.NormChiSq()
+for i in range(N_HISTOGRAMS):
+    for j in range(i+1, N_HISTOGRAMS):
+        h1 = histograms["Uniform%d" % i][0]
+        h2 = histograms["Uniform%d" % j][0]
+        T_dist['uniform'].append(test_stat(h1, h2))
+
+        h1 = histograms["ChiSq5%d" % i][0]
+        h2 = histograms["ChiSq5%d" % j][0]
+        T_dist['chisq'].append(test_stat(h1, h2))
+
+        h1 = histograms["Exponential%d" % i][0]
+        h2 = histograms["Exponential%d" % j][0]
+        T_dist['exponential'].append(test_stat(h1, h2))
+
+        h1 = histograms["Gaussian%d" % i][0]
+        h2 = histograms["Gaussian%d" % j][0]
+        T_dist['gaussian'].append(test_stat(h1, h2))
+
+        h1 = histograms["BiGaussian%d" % i][0]
+        h2 = histograms["BiGaussian%d" % j][0]
+        T_dist['bigaussian'].append(test_stat(h1, h2))
+
+pylab.figure(1)
+pylab.subplot(231)
+pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Uniform')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+bin_values, bin_edges, patches = pylab.hist(T_dist['uniform'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
+x = range(0, int(bin_edges[-1]))
+for ndof in range(10, 60, 10):
+    rv = scipy.stats.chi2(ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+pylab.legend()
+
+pylab.subplot(232)
+pylab.title(r'$\chi^{2}$ Test Statistic Distribution - ChiSq')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+bin_values, bin_edges, patches = pylab.hist(T_dist['chisq'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
+x = range(0, int(bin_edges[-1]))
+for ndof in range(10, 60, 10):
+    rv = scipy.stats.chi2(ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+pylab.legend()
+
+pylab.subplot(233)
+pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Gaussian')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+bin_values, bin_edges, patches = pylab.hist(T_dist['gaussian'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
+x = range(0, int(bin_edges[-1]))
+for ndof in range(10, 60, 10):
+    rv = scipy.stats.chi2(ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+pylab.legend()
+
+pylab.subplot(234)
+pylab.title(r'$\chi^{2}$ Test Statistic Distribution - Exponential')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+bin_values, bin_edges, patches = pylab.hist(T_dist['exponential'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
+x = range(0, int(bin_edges[-1]))
+for ndof in range(10, 60, 10):
+    rv = scipy.stats.chi2(ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+pylab.legend()
+
+pylab.subplot(235)
+pylab.title(r'$\chi^{2}$ Test Statistic Distribution - BiGaussian')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+bin_values, bin_edges, patches = pylab.hist(T_dist['bigaussian'],
+                                            density=True,
+                                            range=(0, 200),
+                                            bins=100)
+x = range(0, int(bin_edges[-1]))
+for ndof in range(10, 60, 10):
+    rv = scipy.stats.chi2(ndof)
+    pylab.plot(x, rv.pdf(x), label='NDOF=%d' % ndof)
+pylab.legend()
+
+pylab.subplot(236)
+pylab.title(r'$\chi^{2}$ Test Statistic Distributions')
+pylab.xlabel(r'$\chi^{2}$')
+pylab.ylabel('Probability Density')
+
+pylab.hist(T_dist['uniform'],
+           histtype='step',
+           label='Uniform',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['bigaussian'],
+           histtype='step',
+           label='BiGaussian',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['gaussian'],
+           histtype='step',
+           label='Gaussian',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['exponential'],
+           histtype='step',
+           label='Exponential',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.hist(T_dist['chisq'],
+           histtype='step',
+           label='ChiSq',
+           density=True,
+           range=(0, 200),
+           bins=100)
+
+pylab.legend()
+
+pylab.show()
diff --git a/examples/vanilla_gaussian.py b/examples/vanilla_gaussian.py
index ce83396..5ae2211 100755
--- a/examples/vanilla_gaussian.py
+++ b/examples/vanilla_gaussian.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
+'''
+Vanilla gaussian example.
+'''
+
+
 import numpy
 import pylab
 import scipy.optimize
@@ -8,52 +13,55 @@
 import voka.metrics.chisq
 
 histograms = dict()
-n_histograms = 1000
-scale = 1.0
-loc = 0.0
-size = 1000
-params = (size/(scale * numpy.sqrt(2 * numpy.pi)), loc, scale)
-for i in range(n_histograms):
-    data = numpy.random.normal(size=size, loc=loc, scale=scale)
-    histograms['Histogram%d' % i] = numpy.histogram(data)
+N_HISTOGRAMS = 1000
+SCALE = 1.0
+LOC = 0.0
+SIZE = 1000
+params = (SIZE/(SCALE * numpy.sqrt(2 * numpy.pi)), LOC, SCALE)
+for i in range(N_HISTOGRAMS):
+    data = numpy.random.normal(size=SIZE, loc=LOC, scale=SCALE)
+    histograms['Histogram%d' % i] = numpy.histogram(data)
 
 def gauss(x, *p):
-    A, mu, sigma = p
-    return A*numpy.exp(-(x-mu)**2/(2.*sigma**2))
-
+    '''
+    Gaussian distribution
+    '''
+    amplitude, mean, sigma = p
+    return amplitude*numpy.exp(-(x-mean)**2/(2.*sigma**2))
+
 test_stat = voka.metrics.chisq.NormChiSq()
-ndof = None
+NDOF = None
 T_dist = list()
-for i in range(n_histograms):
+for i in range(N_HISTOGRAMS):
     h = histograms["Histogram%d" % i]
     bin_values = h[0]
     bin_edges = h[1]
-    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
+    bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
     bw = bin_centers[1] - bin_centers[0]
-    expectation = [gauss(x, bw*params[0], loc, scale) for x in bin_centers]
-
-    if not ndof:
-        ndof = len(bin_values) - 2
-
+    expectation = [gauss(x, bw*params[0], LOC, SCALE) for x in bin_centers]
+
+    if not NDOF:
+        NDOF = len(bin_values) - 2
+
     T_dist.append(test_stat(bin_values, expectation))
-
+
 pylab.figure(1)
 pylab.hist(T_dist, bins=100)
-
+
 pylab.figure(2)
-rv = scipy.stats.chi2(ndof)
-x = range(20)
-pylab.plot(x, rv.pdf(x), 'k-')
+rv = scipy.stats.chi2(NDOF)
+x_values = range(20)
+pylab.plot(x_values, rv.pdf(x_values), 'k-')
 
 pylab.figure(3)
 h = histograms["Histogram0"]
 bin_values = h[0]
 bin_edges = h[1]
-bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
+bin_centers = (bin_edges[:-1] + bin_edges[1:])/2
 bw = bin_centers[1] - bin_centers[0]
 print("bw = %f" % bw)
 pylab.plot(bin_centers, bin_values)
-pylab.plot(bin_centers, [gauss(x, bw*params[0], loc, scale) for x in bin_centers])
+pylab.plot(bin_centers, [gauss(bc, bw*params[0], LOC, SCALE)
+                         for bc in bin_centers])
 pylab.show()
-
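
Note: every example in this patch reduces to the same operation -- evaluate a test statistic on two identically binned samples and compare its distribution against the theoretical chi^2 curve. The sketch below shows that comparison with numpy and scipy only; it assumes voka.metrics.chisq.NormChiSq implements the usual normalized two-histogram chi-square (an assumption about the metric, not a statement of voka's API), so swap in the real metric if it differs.

import numpy
import scipy.stats

def norm_chisq(h1, h2):
    # Assumed form of the normalized two-histogram chi-square:
    # sum over bins of (u - v)^2 / (u + v), skipping empty bin pairs.
    h1 = numpy.asarray(h1, dtype=float)
    h2 = numpy.asarray(h2, dtype=float)
    mask = (h1 + h2) > 0
    return numpy.sum((h1[mask] - h2[mask])**2 / (h1[mask] + h2[mask]))

# Two independent samples of the same distribution, binned identically.
edges = numpy.linspace(-5.0, 5.0, 51)
sample1, _ = numpy.histogram(numpy.random.normal(size=1000), bins=edges)
sample2, _ = numpy.histogram(numpy.random.normal(size=1000), bins=edges)

t_value = norm_chisq(sample1, sample2)
# Naively treating t_value as chi^2-distributed with nbins degrees of freedom
# yields the kind of p-value these examples show to be unreliable for
# histograms of stochastic processes.
naive_p = scipy.stats.chi2(len(sample1)).sf(t_value)
print('T = %.2f, naive p-value = %.3f' % (t_value, naive_p))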