forked from google/centipede
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathshard_reader.h
79 lines (72 loc) · 3.2 KB
/
shard_reader.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
// Copyright 2022 The Centipede Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef THIRD_PARTY_CENTIPEDE_SHARD_READER_H_
#define THIRD_PARTY_CENTIPEDE_SHARD_READER_H_
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>
#include <string_view>
#include <utility>
#include "absl/container/flat_hash_map.h"
#include "./blob_file.h"
#include "./defs.h"
#include "./feature.h"
#include "./util.h"
namespace centipede {
// `corpus_path` is a path to a BlobFile with corpus elements (inputs).
// `features_path` is a path to a BlobFile with {features/hash} pairs created by
// PackFeaturesAndHash.
// For every {features/hash} pair we need to find an input with this hash.
// This function reads `corpus_path` and `features_path` and calls `callback`
// on every pair {input, features}.
// If features are not found for a given input, callback's 2nd argument is {}.
// If features are found for a given input but are empty,
// then callback's 2n-d argument is {feature_domains::kNoFeature}.
template <typename CallBack>
void ReadShard(std::string_view corpus_path, std::string_view features_path,
CallBack callback) {
// Maps features to input's hash.
absl::flat_hash_map<std::string, FeatureVec> hash_to_features;
// Read all features, populate hash_to_features.
{
auto features_reader = DefaultBlobFileReaderFactory();
features_reader->Open(features_path).IgnoreError(); // File may not exist.
absl::Span<uint8_t> hash_and_features;
while (features_reader->Read(hash_and_features).ok()) {
// Every valid feature record must contain the hash at the end.
// Ignore this record if it is too short.
if (hash_and_features.size() < kHashLen) continue;
// Extract the hash.
std::string hash;
hash.insert(hash.end(), hash_and_features.end() - kHashLen,
hash_and_features.end());
// Extract the features, put them into hash_to_features.
size_t num_feature_bytes = hash_and_features.size() - kHashLen;
if (num_feature_bytes == 0) {
// Special case: zero features.
hash_to_features[hash] = {feature_domains::kNoFeature};
continue;
}
FeatureVec features(num_feature_bytes / sizeof(feature_t));
memcpy(features.data(), hash_and_features.data(),
features.size() * sizeof(feature_t));
hash_to_features[hash] = features;
}
}
// Read the corpus. Call `callback` for every {input, features} pair.
auto corpus_reader = DefaultBlobFileReaderFactory();
corpus_reader->Open(corpus_path).IgnoreError(); // File may not exist.
absl::Span<uint8_t> blob;
while (corpus_reader->Read(blob).ok()) {
callback(ByteArray(blob.begin(), blob.end()), hash_to_features[Hash(blob)]);
}
}
} // namespace centipede
#endif // THIRD_PARTY_CENTIPEDE_SHARD_READER_H_