sampler.py
"""Util for sampling properties of the minimal DFA."""
from __future__ import division
import sys
import pandas as pd
from collections import deque
sys.path.append('..')
import merge_alg
import arbitrary_probs_util as string_util


class DepthSampler(object):
    """Class for sampling properties by depth of the minimal DFA."""

    def __init__(self, probs, length):
        """Constructor for DepthSampler.

        Args:
            probs: List of probabilities of seeing each letter.
            length: The length of the generalized strings to sample.
        """
        self.probs = probs
        self.length = length
        self.alphabet = string_util.get_default_alphabet(len(probs))
        self.SAMPLE_MAP = {
            'states': self._get_states_per_depth,
            'threads': self._get_threads_per_depth,
            'thread_children': self._get_thread_children,
            'total_children': self._get_total_children,
            'new_thread': self._get_has_new_thread,
            'merge_degree': self._get_merged_degree,
            'failure_chain_lengths': self._get_failure_chain_lengths,
            'failure_chain_path': self._get_single_failure_chain_path,
            'growth_ratio': self._get_growth_ratio,
        }
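        # Each entry maps a property name to a sampler that takes the DFA root
        # and returns results indexed by depth; draw_samples dispatches on
        # these names.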

    def draw_samples(self, num_samples, props):
        """Draws samples and finds the requested features.

        Args:
            num_samples: Number of samples to observe.
            props: The properties to observe; each must be a key in SAMPLE_MAP.

        Returns: Pandas dataframe with the samples and requested properties.
        """
        df_columns = list(props)
        df_columns.insert(0, 'depth')
        to_return = pd.DataFrame(columns=df_columns)
        # Draw samples.
        for _ in xrange(num_samples):
            gen_string = string_util.create_random_string(self.probs,
                                                          self.length)
            min_dfa = merge_alg.aho_merge(gen_string, self.alphabet)
            root = min_dfa.get_root()
            # Analyze the requested properties for this sample.
            sample_props = [self.SAMPLE_MAP[prop](root) for prop in props]
            depths = range(len(sample_props[0]))
            sample_props.insert(0, depths)
            df_to_add = pd.DataFrame({col: sample_props[index]
                                      for index, col in enumerate(df_columns)})
            to_return = to_return.append(df_to_add)
        return to_return.apply(pd.to_numeric, errors='ignore')
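
    # The frame returned by draw_samples has a 'depth' column plus one column
    # per requested property, with one row per depth for every sampled string;
    # rows from different samples are simply concatenated.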

    def _get_states_per_depth(self, root):
        """Finds the number of states per depth.

        Args:
            root: Root of the DFA.

        Returns: List where each entry is how many states are in the depth
            corresponding with the index.
        """
        to_return = []
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(to_return) < curr_node.depth + 1:
                to_return.append(0)
            to_return[curr_node.depth] += 1
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        return to_return
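
    # The remaining samplers follow the same pattern: a breadth-first walk over
    # the goto edges, with visited state ids tracked in `seen` so that states
    # reachable from several parents (merged threads) are only expanded once.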

    def _get_threads_per_depth(self, root):
        """Finds the number of threads (failures not going to root) per depth.

        Args:
            root: Root of the DFA.

        Returns: List where each entry is how many threads are in the depth
            corresponding with the index.
        """
        to_return = []
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(to_return) < curr_node.depth + 1:
                to_return.append(0)
            if _is_thread(curr_node):
                to_return[curr_node.depth] += 1
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        return to_return

    def _get_thread_children(self, root):
        """Samples the number of children that each thread has, per depth.

        A key decision here is that each child is only ever counted once:
        if two threads merge, their offspring is attributed to just one of
        the parent threads.

        Args:
            root: Root of the DFA.

        Returns: List of lists where each inner list has the samples and its
            index is the depth in the DFA.
        """
        to_return = []
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(to_return) < curr_node.depth + 1:
                to_return.append([])
            thread_children = 0 if _is_thread(curr_node) else None
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
                    if (_is_thread(curr_node) and _is_offspring(child)
                            and child.failure.depth > 1):
                        thread_children += 1
            if thread_children is not None:
                to_return[curr_node.depth].append(thread_children)
        return to_return

    def _get_total_children(self, root):
        """Samples the TOTAL number of children that each thread has, per depth.

        This counts every child, i.e. a merged child is counted once for each
        of its parent threads.

        Args:
            root: Root of the DFA.

        Returns: List of lists where each inner list has the samples and its
            index is the depth in the DFA.
        """
        to_return = []
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(to_return) < curr_node.depth + 1:
                to_return.append([])
            if _is_thread(curr_node):
                thread_children = 0
                for child in set(curr_node.goto.values()):
                    if _is_thread(child):
                        thread_children += 1
                to_return[curr_node.depth].append(thread_children)
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        return to_return

    def _get_has_new_thread(self, root):
        """Finds whether each depth has a new thread or not.

        Args:
            root: The root of the DFA.

        Returns: List where there is a 0 if the corresponding depth does not
            have a new thread and 1 if it does.
        """
        to_return = []
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(to_return) < curr_node.depth + 1:
                to_return.append(0)
            if curr_node.failure is not None and curr_node.failure.depth == 1:
                to_return[curr_node.depth] = 1
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        return to_return

    def _get_merged_degree(self, root):
        """Finds how strongly threads merge, per depth.

        If a thread comes from the merging of multiple threads, this records
        the number of parent threads for that node minus one.

        Args:
            root: The root of the DFA.

        Returns: List of lists where each inner list contains the degree of
            each merged node at the depth corresponding to the inner list's
            index.
        """
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        # List of dictionaries where index i corresponds to depth i + 1.
        degree_mappings = [{}]
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(degree_mappings) < curr_node.depth + 2:
                degree_mappings.append({})
            if _is_thread(curr_node):
                degrees = degree_mappings[curr_node.depth + 1]
                for child in set(curr_node.goto.values()):
                    if _is_thread(child):
                        if child.sid in degrees.keys():
                            degrees[child.sid] += 1
                        else:
                            degrees[child.sid] = 0
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        # Trim off the last entry since the last depth has no children.
        return ([[deg for deg in depth.values() if deg > 0]
                 for depth in degree_mappings][:-1])

    def _get_failure_chain_lengths(self, root):
        """Get the length of the failure chain for each state.

        Args:
            root: The root of the DFA.

        Returns: List of lists where each inner list contains the length of the
            failure chain for each of the states in the depth.
        """
        queue = deque()
        queue.appendleft(root)
        seen = set()
        seen.add(root.sid)
        results = []
        while len(queue) > 0:
            curr_node = queue.pop()
            if len(results) < curr_node.depth + 1:
                results.append([])
            results[curr_node.depth].append(_count_failure_chain(curr_node))
            for child in curr_node.goto.values():
                if child.sid not in seen:
                    queue.appendleft(child)
                    seen.add(child.sid)
        return results

    def _get_single_failure_chain_path(self, root):
        """Get the lengths of the failure chains for states along a single path.

        Args:
            root: The root of the DFA.

        Returns: List of integers, each representing the length of the failure
            chain at that particular depth.
        """
        # Depth 0 (the root) has chain length 0 and depth 1 always has
        # chain length 1.
        path = [0, 1]
        # Iterate through the root's transitions to find the first state at
        # depth 1, skipping the root's self-loops.
        state = root
        for child in state.goto.values():
            if child is not state:
                state = child
                break
        # Follow an arbitrary goto transition at each subsequent depth.
        while len(state.goto) > 0:
            state = state.goto.values()[0]
            chain_length = _count_failure_chain(state)
            path.append(chain_length)
        return path

    def _get_growth_ratio(self, root):
        """Get the growth ratio, i.e. (states at depth n + 1) / (states at depth n).

        Args:
            root: The root of the DFA.

        Returns: List of the ratios; note that the last entry is always 0.
        """
        # Note: this recomputes the per-depth state counts even if 'states'
        # was also requested, doing the work twice rather than reusing it.
        states = self._get_states_per_depth(root)
        ratios = [0 for _ in xrange(self.length + 1)]
        for ind in xrange(self.length):
            ratios[ind] = states[ind + 1] / states[ind]
        return ratios


def _count_failure_chain(state):
    """Counts the failure transitions followed until a state with no failure."""
    count = 0
    curr = state
    while curr.failure is not None:
        curr = curr.failure
        count += 1
    return count


def _is_thread(state):
    """A thread is a state whose failure transition does not go to the root."""
    return state.failure is not None and state.failure.depth > 0


def _is_offspring(state):
    """An offspring is a state whose failure points to a state below depth 1."""
    return state.failure is not None and state.failure.depth > 1


if __name__ == '__main__':
    DS = DepthSampler([0.5 for _ in range(4)], 4)
    DF = DS.draw_samples(1, ['states', 'threads', 'thread_children',
                             'new_thread', 'merge_degree', 'total_children',
                             'failure_chain_lengths', 'failure_chain_path'])
    print DF
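
    # A possible follow-up (a sketch, not part of the original script): the
    # per-depth draws can be aggregated with standard pandas calls, e.g.
    # averaging a numeric property over all samples at each depth:
    # print DF.groupby('depth')['states'].mean()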