-
Notifications
You must be signed in to change notification settings - Fork 0
/
features.h
495 lines (441 loc) · 15.3 KB
/
features.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/**
* \file features.h
*
* This file contains structures for representing sets of discrete features,
* where each feature_set has `n` features, and each feature is of type
* `unsigned int`.
*
* <!-- Created on: Dec 1, 2016
* Author: asaparov -->
*/
#ifndef FEATURES_H_
#define FEATURES_H_
#include <core/array.h>
using namespace core;
/**
* This struct represents a sequence of features, with length
* feature_set::feature_count, each feature having type `unsigned int`.
*/
struct feature_set {
/**
* The native array that stores the feature values.
*/
unsigned int* features;
/**
* The number of features and the length of feature_set::features,
* feature_set::excluded_counts, and the first dimension of
* feature_set::excluded.
*/
unsigned int feature_count;
/**
* A two-dimensional array containing *excluded* feature values. That is,
* `excluded[i]` contains the set of excluded feature values for the
* feature at index `i`. Each `excluded[i]` is a sorted array of
* `unsigned int` feature values containing distinct elements. The length
* of `excluded[i]` is given by `feature_set::excluded_counts[i]`. For each
* `j` such that `feature_set::excluded_counts[j] == 0`, `excluded[j]` is
* `NULL`.
*/
unsigned int** excluded;
/**
* An array that stores the length of each `feature_set::excluded[i]`.
*/
unsigned int* excluded_counts;
/**
* Initializes this feature set with the given `feature_count`.
* Each element in feature_set::features is uninitialized, whereas
* feature_set::excluded_counts is initialized to all zeros.
*/
feature_set(unsigned int feature_count) : feature_count(feature_count) {
if (!initialize()) exit(EXIT_FAILURE);
}
/**
* Initializes this feature set by copying from the given feature_set `src`.
*/
feature_set(const feature_set& src) : feature_count(src.feature_count) {
if (!initialize(src)) exit(EXIT_FAILURE);
}
~feature_set() { free(); }
/**
* Returns the feature value at the given `index`.
*/
inline unsigned int operator [] (unsigned int index) const {
return features[index];
}
/**
* Returns the feature value at the given `index`.
*/
inline unsigned int& operator [] (unsigned int index) {
return features[index];
}
/**
* Sets the feature value at the given `index`.
*/
inline void set_feature(unsigned int index, unsigned int value) {
features[index] = value;
}
/**
* Returns `true` if the given feature `value` is excluded from the feature
* at the given `index`. Otherwise, this function returns `false`.
*/
inline bool is_excluded(unsigned int index, unsigned int value) const {
return index_of(value, excluded[index], excluded_counts[index]) < excluded_counts[index];
}
/**
* Checks that the excluded array at `feature_set::excluded[index]` has
* sufficient `capacity`. If that array is `NULL`, this function will
* initialize it (leaving its elements uninitialized).
*/
bool ensure_excluded_capacity(unsigned int index, unsigned int capacity) {
if (excluded_counts[index] == 0) excluded[index] = NULL;
unsigned int* new_excluded = (unsigned int*) realloc(
excluded[index], sizeof(unsigned int) * capacity);
if (new_excluded == NULL) {
fprintf(stderr, "feature_list.excluded_unsorted ERROR: Out of memory.\n");
return false;
}
excluded[index] = new_excluded;
return true;
}
/**
* Initializes the excluded array at `feature_set::excluded[index]` and
* copies the contents from `src`. This function assumes
* `feature_set::excluded[index]` is previously uninitialized.
*/
bool set_excluded(unsigned int index, const unsigned int* src, unsigned int count) {
if (count == 0) return true;
excluded[index] = (unsigned int*) malloc(sizeof(unsigned int) * count);
if (excluded[index] == NULL) {
fprintf(stderr, "feature_set.set_excluded ERROR: Out of memory.\n");
return false;
}
excluded_counts[index] = count;
memcpy(excluded[index], src, sizeof(unsigned int) * count);
return true;
}
/**
* Excludes the given `item` from the feature at the given `index`, without
* sorting the excluded array `feature_set::excluded[index]`.
* `feature_set::excluded_counts[index]` is also incremented. This function
* assumes the excluded array does not already contain `item` and has
* sufficient capacity.
*/
void exclude_unsorted(unsigned int index, unsigned int item) {
excluded[index][excluded_counts[index]] = item;
excluded_counts[index]++;
}
/**
* Sorts the excluded array at `feature_set::excluded[index]`. This
* function assumes the excluded array is not `NULL`.
*/
void sort_excluded(unsigned int index) {
if (excluded_counts[index] == 0)
core::free(excluded[index]);
else insertion_sort(excluded[index], excluded_counts[index]);
}
/**
* Evaluates the hash function for the given feature_set `set`.
*/
static inline unsigned int hash(const feature_set& set) {
unsigned int hash = default_hash(set.features, set.feature_count);
for (unsigned int i = 0; i < set.feature_count; i++)
if (set.excluded_counts[i] > 0)
hash ^= default_hash(set.excluded[i], set.excluded_counts[i]);
return hash;
}
/**
* Moves the feature_set in `src` to `dst`. Note that this function does
* not initialize the fields in `dst` and copy the contents from the
* corresponding fields in `src` into `dst`. Rather, this function simply
* copies the pointers.
*/
static inline void move(const feature_set& src, feature_set& dst) {
dst.features = src.features;
dst.feature_count = src.feature_count;
dst.excluded = src.excluded;
dst.excluded_counts = src.excluded_counts;
}
/**
* Swaps the underlying pointers in the feature_sets `src` and `dst`.
*/
static inline void swap(feature_set& first, feature_set& second) {
core::swap(first.features, second.features);
core::swap(first.feature_count, second.feature_count);
core::swap(first.excluded, second.excluded);
core::swap(first.excluded_counts, second.excluded_counts);
}
/**
* Returns whether feature_set::features is `NULL` in the given `set`.
*/
static inline bool is_empty(const feature_set& set) {
return set.features == NULL;
}
/**
* Sets feature_set::features to `NULL` in the given `set`.
*/
static inline void set_empty(feature_set& set) {
set.features = NULL;
}
/**
* Sets feature_set::features to `NULL` in every element of the given array `sets`.
*/
static inline void set_empty(feature_set* sets, unsigned int length) {
for (unsigned int i = 0; i < length; i++)
sets[i].features = NULL;
}
/**
* Frees the underlying arrays in the given feature_set `set`.
*/
static inline void free(feature_set& set) {
set.free();
}
private:
inline bool initialize() {
features = (unsigned int*) malloc(sizeof(unsigned int) * feature_count);
if (features == NULL) {
fprintf(stderr, "feature_set.initialize ERROR: Insufficient memory for feature vector.\n");
return false;
}
excluded = (unsigned int**) malloc(sizeof(unsigned int*) * feature_count);
if (excluded == NULL) {
fprintf(stderr, "feature_set.initialize ERROR: Insufficient memory for excluded sets.\n");
core::free(features);
return false;
}
excluded_counts = (unsigned int*) calloc(sizeof(unsigned int), feature_count);
if (excluded_counts == NULL) {
fprintf(stderr, "feature_set.initialize ERROR: Insufficient memory for excluded count array.\n");
core::free(features); core::free(excluded);
return false;
}
return true;
}
inline bool initialize(const feature_set& src) {
if (!initialize()) return false;
memcpy(features, src.features, sizeof(unsigned int) * feature_count);
for (unsigned int i = 0; i < src.feature_count; i++) {
if (src.excluded_counts[i] > 0) {
excluded[i] = (unsigned int*) malloc(sizeof(unsigned int) * src.excluded_counts[i]);
if (excluded[i] == NULL) {
fprintf(stderr, "feature_set.initialize ERROR: Insufficient memory for excluded set at index %u.\n", i);
free(); return false;
}
memcpy(excluded[i], src.excluded[i], sizeof(unsigned int) * src.excluded_counts[i]);
excluded_counts[i] = src.excluded_counts[i];
}
}
return true;
}
inline void free() {
core::free(features);
for (unsigned int i = 0; i < feature_count; i++) {
if (excluded_counts[i] > 0)
core::free(excluded[i]);
}
core::free(excluded);
core::free(excluded_counts);
}
friend bool init(feature_set&, unsigned int);
friend bool init(feature_set&, const feature_set&);
};
/**
* Initializes the given feature_set `set` with the given `feature_count`. Each
* element in feature_set::features is uninitialized, whereas
* feature_set::excluded_counts is initialized to all zeros.
*/
inline bool init(feature_set& set, unsigned int feature_count) {
set.feature_count = feature_count;
return set.initialize();
}
/**
* Initializes the given feature_set `set` by copying from the given feature_set `src`.
*/
inline bool init(feature_set& set, const feature_set& src) {
set.feature_count = src.feature_count;
return set.initialize(src);
}
/**
* Returns whether the feature_set in `first` is equivalent to the one in
* `second`. For equivalence, the feature sets must have the same sequence of
* features, as well as the same set of excluded features at every index.
*/
inline bool operator == (const feature_set& first, const feature_set& second) {
if (first.features == NULL || first.feature_count != second.feature_count) return false;
for (unsigned int i = 0; i < first.feature_count; i++)
if (first.features[i] != second.features[i]) return false;
for (unsigned int i = 0; i < first.feature_count; i++) {
if (first.excluded_counts[i] != second.excluded_counts[i])
return false;
if (first.excluded_counts[i] > 0) {
for (unsigned int j = 0; j < first.excluded_counts[i]; j++)
if (first.excluded[i][j] != second.excluded[i][j])
return false;
}
}
return true;
}
/**
* A sorter for feature sets that enables lexicographical sorting according to
* the sequence of feature values.
*/
struct feature_set_sorter {
unsigned int depth;
/**
* Initializes the feature_set_sorter with the given `depth`. This
* feature_set_sorter sorts feature set objects that have length `depth - 1`.
*/
feature_set_sorter(unsigned int depth) : depth(depth) { }
};
/**
* Compares the given feature set objects `first` and `second` using the
* feature_set_sorter `sorter`.
* \tparam FeatureSet a feature set type that implements the function
* `T get_feature(unsigned int)` where `T` is any type that satisfies
* [LessThanComparable](https://en.cppreference.com/w/cpp/named_req/LessThanComparable).
*/
template<typename FeatureSet>
inline bool less_than(
const FeatureSet& first,
const FeatureSet& second,
const feature_set_sorter& sorter)
{
for (unsigned int i = 0; i < sorter.depth - 1; i++) {
if (first.get_feature(i) < second.get_feature(i))
return true;
else if (second.get_feature(i) < first.get_feature(i))
return false;
}
return false;
}
/**
* This structure represents a feature_set weighted by a numerical value, such
* as a probability.
* \tparam V satisfies [is_arithmetic](http://en.cppreference.com/w/cpp/types/is_arithmetic).
*/
template<typename V>
struct weighted_feature_set {
/**
* The underlying feature_set.
*/
feature_set features;
/**
* The weight of weighted_feature_set::features.
*/
V log_probability;
/**
* Returns the feature value at the given `index`.
*/
inline unsigned int get_feature(unsigned int index) const {
return features[index];
}
/**
* Sets the feature value at the given `index` to `feature`.
*/
inline void set_feature(unsigned int index, unsigned int feature) {
features[index] = feature;
}
/**
* Checks that the excluded array at `feature_set::excluded[index]` in
* `weighted_feature_set::features` has sufficient capacity. If that array
* is `NULL`, this function will initialize it (leaving its elements
* uninitialized).
*/
inline bool ensure_excluded_capacity(unsigned int index, unsigned int capacity) {
return features.ensure_excluded_capacity(index, capacity);
}
/**
* Initializes the excluded array at `feature_set::excluded[index]` in
* `weighted_feature_set::features` and copies the contents from `src`.
* This function assumes `feature_set::excluded[index]` is previously
* uninitialized.
*/
inline bool set_excluded(unsigned int index, const unsigned int* src, unsigned int count) {
return features.set_excluded(index, src, count);
}
/**
* Excludes the given item from the feature at the given `index`, without
* sorting the excluded array `feature_set::excluded[index]` in
* `weighted_feature_set::features`. `feature_set::excluded_counts[index]`
* is also incremented. This function assumes the excluded array does not
* already contain `item` and has sufficient capacity.
*/
inline void exclude_unsorted(unsigned int index, unsigned int item) {
return features.exclude_unsorted(index, item);
}
/**
* Sorts the excluded array at `feature_set::excluded[index]` in
* `weighted_feature_set::features`. This function assumes the excluded
* array is not `NULL`.
*/
inline void sort_excluded(unsigned int index) {
return features.sort_excluded(index);
}
/**
* Returns the natural exponent of the weight weighted_feature_set::log_probability.
*/
inline V get_probability() const {
return exp(log_probability);
}
/**
* Sets the weight weighted_feature_set::log_probability to the natural
* logarithm of the given `probability`.
*/
inline void set_probability(V probability) {
log_probability = log(probability);
}
/**
* Moves the given weighted_feature_set in `src` into `dst`.
*/
static inline void move(const weighted_feature_set<V>& src, weighted_feature_set<V>& dst) {
dst.log_probability = src.log_probability;
feature_set::move(src.features, dst.features);
}
/**
* Swaps the weighted_feature_set structures in `first` and `second`.
*/
static inline void swap(weighted_feature_set<V>& first, weighted_feature_set<V>& second) {
feature_set::swap(first.features, second.features);
core::swap(first.log_probability, second.log_probability);
}
/**
* Frees the given weighted_feature_set `set`.
*/
static inline void free(weighted_feature_set<V>& set) {
core::free(set.features);
}
};
/**
* Initializes the given weighted_feature_set `set` with the given
* `feature_count`. Each element in feature_set::features in
* weighted_feature_set::features is uninitialized, whereas
* feature_set::excluded_counts is initialized to all zeros. The weight
* weighted_feature_set::log_probability is uninitialized.
*/
template<typename V>
inline bool init(weighted_feature_set<V>& set, unsigned int feature_count) {
return init(set.features, feature_count);
}
/**
* Initializes the given weighted_feature_set `set` with the given
* feature_set `src` and weight `log_probability`.
*/
template<typename V>
inline bool init(weighted_feature_set<V>& set, const feature_set& src, V log_probability) {
set.log_probability = log_probability;
return init(set.features, src);
}
/**
* Returns whether weighted_feature_set::log_probability of `first` is *greater than* that of `second`.
*/
template<typename V>
inline bool operator < (const weighted_feature_set<V>& first, const weighted_feature_set<V>& second) {
return first.log_probability > second.log_probability;
}
/**
* Returns the weight weighted_feature_set::log_probability of `set`.
*/
template<typename V>
inline double log_probability(const weighted_feature_set<V>& set) {
return set.log_probability;
}
#endif /* FEATURES_H_ */