-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf_tools.py
298 lines (218 loc) · 8.57 KB
/
pdf_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""Defines basic forms of probability distributions used in the packages.
TODO: since occurrence rate must be binned in radius; one option
is to have one bin per complenetess bin (i.e same radius
intervals), so that it's more general.
For this to work however; then I would set a Gaussian prior
on the radius.
TODO: consider the possibility of calculating the integral in the
Poisson likelihood simply through basic Monte-Carlo.
TODO: create bin-grid object.
TODO: decide on logP vs P, and same for R
TODO: change HP (occr) to log
"""
import numpy as np
class BinnedCompleteness(object):
"""Probability object that defines the binned completeness function.
TODO: determine if this is the observable or real occurrence rate;
Better: this is a general function that doesn't care; instead
observable completeness will be a combination of these and-or
others.
Alternatively: this is the observable occurrence (or that
inherits this), and is a composition of the real binned
completenes and the occurrence rate function (which is a callable
that is set during initiation).
TODO: create internal properties; binning parameters, plus a function
to go from (R or P) value to bin-index (independent and not).
TODO: auto-integrate function (i.e get N_expected)
"""
def __init__(self, R_boundaries, P_boundaries, value_grid,
occurrence_rate_func, outside_value=0.0):
"""Creates the probability from a binned array of completeness values.
Completeness grid has a N x M dimension; with N R-bins,
and M P-bins.
Args:
R_boundaries (np.array, (N+1)-dim): the boundaries of the bins
in R space
P_boundaries (np.array, (M+1)-dim): the boundaries of the bins
in P space
value_grid (np.ndarray, shape: NxM): the values of the
*completeness* in each bin
"""
raise NotImplementedError
class ObservableOccurrenceBasic(object):
"""Probability object defines the observable occurrence pdf.
The completeness is treated as binned in a set number of bins
in period and planet radius. The true occurence is treated as
binned only in radius in a set number of intervals.
NOTE: regarding logP vs P; this object doesn't care, at least
in the calculations. That's an outside consideration.
-------------
TODO: determine if this is the observable or real occurrence rate;
Better: this is a general function that doesn't care; instead
observable completeness will be a combination of these and-or
others.
Alternatively: this is the observable occurrence (or that
inherits this), and is a composition of the real binned
completenes and the occurrence rate function (which is a callable
that is set during initiation).
TODO: create internal properties; binning parameters, plus a function
to go from (R or P) value to bin-index (independent and not).
TODO: auto-integrate function (i.e get N_expected)
TODO: for the above; cache the value, so that it isn't
recalculated unless the hyperparameters (theta or
something) change.
In order for this to work; we need strong setter
and getter functions on the hyperparameters and
completeness (or have their values hidden).
"""
def __init__(self, R_boundaries, P_boundaries, cpf_value_grid,
N_stars, outside_value=0.0, event_values=None):
"""Creates the probability from array of completeness values.
Completeness grid has a N x M dimension; with N R-bins,
and M P-bins.
Occurence rate is binned in radius, in 1 Rj intervals,
for now.
Args:
R_boundaries (np.array, (N+1)-dim): the boundaries of
the bins in R space
P_boundaries (np.array, (M+1)-dim): the boundaries of
the bins in P space
value_grid (np.ndarray, shape: NxM): the values of the
*completeness* in each bin
"""
self._R_boundaries = np.array(R_boundaries)
self._P_boundaries = np.array(P_boundaries)
self._cpf_grid = np.array(cpf_value_grid)
self._N_stars = N_stars
# Flag; if True, the parameters have been unchanged since
# the last time the rate integral was calculated; therefore
# the cached integral is safe to use.
self._int_cache_flag = False
self._int_cache = 0.0
# Initiate the occurrence rate in same bins of R as cpf
self.occr_array = np.zeros(np.shape(self)[0])
# Initiate event values
self._event_values = event_values
def __call__(self, *args, **kwargs):
"""Calculates the value of the likelihood.
Args:
event_values (np.array or float): the R, P pairs to calculate
the likelihood. If None or empty, calculates it
assuming zero events. Otherwise, expects a single
(R, P) coordinate, or N values of (R, P), i.e
shape (N x 2).
"""
return self.log_likelihood(*args, **kwargs)
def likelihood(self, occr=None, event_values=None):
"""Calculates the value of the likelihood.
Args:
event_values (np.array or float): the R, P pairs to calculate
the likelihood. If None or empty, calculates it
assuming zero events. Otherwise, expects a single
(R, P) coordinate, or N values of (R, P), i.e
shape (N x 2).
"""
if occr is not None:
if (occr < 0.0).any():
return 0.0
self.occr = occr
if event_values is not None:
self.event_values = event_values
I = self.calc_integral() * self._N_stars
# Case of no events
if self.event_values is None or not hasattr(self.event_values,
'__len__'):
return np.exp(-I)
else:
return np.exp(-I) * np.prod(self.rate_density(self.event_values))
def log_likelihood(self, occr=None, event_values=None):
"""Calculates the value of the likelihood.
Args:
event_values (np.array or float): the R, P pairs to calculate
the likelihood. If None or empty, calculates it
assuming zero events. Otherwise, expects a single
(R, P) coordinate, or N values of (R, P), i.e
shape (N x 2).
"""
if occr is not None:
if (occr < 0.0).any():
return -np.inf
self.occr = occr
if event_values is not None:
self.event_values = event_values
I = self.calc_integral() * self._N_stars
# Case of no events
if self.event_values is None or not hasattr(self.event_values,
'__len__'):
return - I
else:
return np.sum(np.log(self.rate_density(self.event_values))) - I
def calc_integral(self):
"""Calculates the in TODO"""
# Return the cached value if possible
if self._int_cache_flag:
return self._int_cache
I = np.sum(self.occr * np.sum(self._cpf_grid * self.calc_bin_volumes(),
axis=1)
)
self._int_cache_flag = True
self._int_cache = I
return I
def calc_bin_volumes(self):
"""Calculates the array of areas (Lebesque measure) per bin."""
return np.outer(np.diff(self._R_boundaries),
np.diff(self._P_boundaries))
def rate_density(self, value):
"""Returns the rate density at a particular value of (R, P)."""
if value.ndim == 2:
value = value.T
R_i = np.digitize(value[0], self._R_boundaries) - 1
P_i = np.digitize(value[1], self._P_boundaries) - 1
return self.occr[R_i] * self._cpf_grid[R_i, P_i]
# Estimators
# ----------
def predict_rate_grid(self, N_stars=1):
"""Predicts the rate at each cpf-grid bin."""
rate_grid = (self._cpf_grid * self.calc_bin_volumes()).T * self.occr
return N_stars * rate_grid
def marginalise_occr_period(self):
"""Marginalises the occurence rate over the range of periods."""
return self.occr * self.calc_bin_volumes().sum(axis=1)
# Properties
# ----------
# TODO: make sure object references work as expected (i.e copies)
@property
def shape(self):
"""Gives the shape of the completeness array."""
return np.array([len(self._R_boundaries)-1, len(self._P_boundaries)-1])
def get_occr(self):
return self.occr_array
def set_occr(self, array):
if np.shape(array) != np.shape(self)[0]:
raise ValueError("Input array is the wrong shape.")
elif (array < 0.0).any():
raise ValueError("Negative occurrence rate is invalid.")
self._int_cache_flag = False
self.occr_array = array
def get_event_values(self):
return self._event_values
def set_event_values(self, array):
self._int_cache_flag = False
if array is None:
self._event_values = None
elif np.ndim(array) == 1:
self._event_values = np.array(array).reshape([1, 2])
elif (np.ndim(array) == 2) and (np.shape(array)[1] == 2):
self._event_values = np.array(array)
else:
raise ValueError('Invalid entry to event values '
'(check the shape).')
occr = property(get_occr, set_occr)
event_values = property(get_event_values, set_event_values)
class BinGrid(object):
"""Class that overwrites a pandas DataFrame and acts as
bin-grid function.
Should contain a callable (funct-at); and properly define
intervals and boundaries.
"""
pass