forked from tillneu/bitcoin-clusterer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
histcluster.cc
159 lines (124 loc) · 4.23 KB
/
histcluster.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#include <math.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "hdr/data.h"
using namespace std;
/**
 * Formats a number as a short, human-readable label.
 *
 * The value is truncated to an unsigned integer first. Values up to and
 * including `maxN` are printed in decimal with a comma between every group of
 * three digits (e.g. 12345 -> "12,345"). Larger values are abbreviated to the
 * nearest power of ten in LaTeX math notation (e.g. 1000000 -> "$10^{6}$").
 *
 * @param x     value to format; must be non-negative (it is cast to size_t)
 * @param maxN  largest value that is still printed in full (default 100000)
 * @return the formatted label
 */
std::string makeNiceNumber(double x, size_t maxN = 100000) {
    const size_t n = static_cast<size_t>(x);

    if (n > maxN) {
        // Too large to print in full: round to the nearest power of ten.
        const int exponent = static_cast<int>(std::round(std::log10(static_cast<double>(n))));
        return "$10^{" + std::to_string(exponent) + "}$";
    }

    std::string digits = std::to_string(n);
    // Walk from the right in steps of three; the signed index lets the loop
    // terminate cleanly for numbers with three or fewer digits.
    for (int pos = static_cast<int>(digits.length()) - 3; pos > 0; pos -= 3) {
        digits.insert(static_cast<size_t>(pos), ",");
    }
    return digits;
}
/**
 * Builds a histogram whose bin widths grow geometrically.
 *
 * Bin i covers the half-open interval [logbase^i, logbase^(i+1)). Each entry of
 * the result is (count, (lower bound, upper bound)).
 *
 * @param values   samples to count; samples below 1 (the lower bound of bin 0)
 *                 or non-finite samples are reported on stderr and skipped
 * @param logbase  growth factor between consecutive bins; must be > 1
 * @param max      largest value the histogram has to cover; when 0 it is
 *                 derived from `values`. Samples beyond the last bin are
 *                 reported on stderr and skipped.
 * @return the bins in ascending order; empty when `logbase` is invalid or
 *         there is nothing >= 1 to cover
 */
template<typename T>
std::vector<std::pair<int, std::pair<double, double>>> genLogHist(const std::vector<T>& values, double logbase, T max = 0) {
    std::vector<std::pair<int, std::pair<double, double>>> hist;

    if (!(logbase > 1.0)) {
        std::cerr << "invalid logbase " << logbase << " (must be > 1)" << std::endl;
        return hist;
    }

    if (max == 0) {
        for (const auto v : values) {
            if (v > max)
                max = v;
        }
    }
    const double upper = static_cast<double>(max);
    if (!(upper >= 1.0) || !std::isfinite(upper)) {
        // Nothing falls into any bin; log() of such a value would not yield a
        // usable bin count.
        return hist;
    }

    const double logOfBase = std::log(logbase);
    // Maps a finite value >= 1 to its bin. The logarithm only gives a first
    // guess: rounding can put exact powers of the base one bin too low (e.g.
    // log(1000)/log(10) < 3), so the guess is corrected against the very same
    // pow() bounds that are reported for the bins below.
    const auto binIndex = [logbase, logOfBase](double v) -> size_t {
        size_t idx = static_cast<size_t>(std::log(v) / logOfBase);
        while (std::pow(logbase, static_cast<double>(idx + 1)) <= v)
            ++idx;
        while (idx > 0 && std::pow(logbase, static_cast<double>(idx)) > v)
            --idx;
        return idx;
    };

    const size_t nbins = binIndex(upper) + 1;
    hist.resize(nbins);
    for (size_t i = 0; i < nbins; ++i) {
        hist[i].second.first = std::pow(logbase, static_cast<double>(i));      // included in the bin  [
        hist[i].second.second = std::pow(logbase, static_cast<double>(i + 1)); // excluded from the bin )
    }

    for (const auto v : values) {
        const double dv = static_cast<double>(v);
        if (!(dv >= 1.0) || !std::isfinite(dv)) {
            std::cerr << "value outside histogram range " << v << std::endl;
            continue;
        }
        const size_t pos = binIndex(dv);
        if (pos >= hist.size()) {
            std::cerr << "value too large " << v << " -> " << std::log(dv) / logOfBase << " -> " << pos << std::endl;
            continue;
        }
        hist[pos].first++;
    }
    return hist;
}
/**
 * Reads a binary cluster file and writes summary statistics followed by a
 * logarithmic histogram of the cluster sizes to a text file.
 *
 * Input layout as consumed here (native size_t width and byte order):
 *   size_t   number of clusters
 *   per cluster:
 *     size_t   number of addresses, followed by that many ADDRBYTES-byte records
 *     size_t   number of transactions, followed per transaction by a
 *              HASHBYTES-byte hash and a 32-bit block id
 *
 * Progress is printed to stdout every 1,000,000 clusters. If the input cannot
 * be opened or ends prematurely, an error is printed to stderr and the
 * function returns without writing statistics.
 *
 * @param fname     path of the binary cluster file
 * @param foutname  path of the text file to write
 * @param logbase   growth factor of the histogram bins
 */
void readClusters(const std::string& fname, const std::string& foutname, const int logbase) {
    std::ifstream file(fname, std::ios::binary);
    if (!file) {
        std::cerr << "cannot open input file " << fname << std::endl;
        return;
    }

    // Reads exactly `len` bytes; false on a short read or stream error.
    const auto readBytes = [&file](void* dst, size_t len) -> bool {
        file.read(static_cast<char*>(dst), static_cast<std::streamsize>(len));
        return static_cast<bool>(file);
    };
    const auto reportTruncated = [&fname](size_t cluster) {
        std::cerr << "unexpected end of " << fname << " while reading cluster " << cluster << std::endl;
    };

    size_t ccount = 0;
    if (!readBytes(&ccount, sizeof(size_t))) {
        std::cerr << "cannot read cluster count from " << fname << std::endl;
        return;
    }

    std::ofstream fout(foutname);
    if (!fout) {
        std::cerr << "cannot open output file " << foutname << std::endl;
        return;
    }
    fout << "# number of clusters: " << ccount << std::endl << std::endl;

    std::vector<int> addrcounts;
    size_t maxsize = 0;
    size_t addresses = 0;
    size_t transactions = 0;
    unsigned char addr[ADDRBYTES];
    unsigned char hash[HASHBYTES];
    int32_t blockid = 0;

    for (size_t i = 0; i < ccount; ++i) {
        if (i % 1000000 == 0) {
            std::cout << i << "\t" << i/(double)ccount*100 << "%" << std::endl;
        }

        size_t csize = 0;
        if (!readBytes(&csize, sizeof(size_t))) {
            reportTruncated(i);
            return;
        }
        addrcounts.push_back(static_cast<int>(csize));
        if (csize > maxsize) {
            maxsize = csize;
        }

        LOG(csize << " Addresses:" << std::endl);
        for (size_t j = 0; j < csize; ++j) {
            if (!readBytes(addr, ADDRBYTES)) {
                reportTruncated(i);
                return;
            }
            ++addresses;
            LOG("\t" << EncodeBase58(addr, addr+ADDRBYTES) << std::endl);
        }

        size_t tsize = 0;
        if (!readBytes(&tsize, sizeof(size_t))) {
            reportTruncated(i);
            return;
        }
        LOG(tsize << " Transactions:" << std::endl);
        for (size_t j = 0; j < tsize; ++j) {
            if (!readBytes(hash, HASHBYTES) || !readBytes(&blockid, sizeof(int32_t))) {
                reportTruncated(i);
                return;
            }
            ++transactions;
            LOG("\t\t" << Hash2String(hash) << " in block " << blockid << std::endl);
        }
    }
    file.close();

    if (addrcounts.empty()) {
        // Average and quantile are undefined without any cluster.
        std::cerr << "no clusters found in " << fname << std::endl;
        return;
    }

    // Index of the 9/10 quantile. nth_element only guarantees that this one
    // position holds the value a full sort would put there; the vector is
    // reordered, which does not matter for the counts taken afterwards.
    const size_t quantil = (addrcounts.size()*9)/10;
    std::nth_element(addrcounts.begin(), addrcounts.begin() + quantil, addrcounts.end());
    const double avgSize = addresses/(double)ccount;
    const size_t ones = static_cast<size_t>(std::count(addrcounts.begin(), addrcounts.end(), 1));

    fout << "# total number addresses: " << addresses << std::endl;
    fout << "# total number transactions: " << transactions << std::endl << std::endl;
    fout << "# avg size of clusters: " << avgSize << std::endl;
    // The label used to claim "99%" although the 9/10 quantile is computed above.
    fout << "# 90% quantil size of clusters: " << addrcounts[quantil] << std::endl << std::endl;
    fout << "# size of largest cluster: " << maxsize << std::endl;
    fout << "# number of clusters with size one: " << ones << std::endl;

    // One ready-made LaTeX table row.
    fout << "# table: "<< fname << "\t" << ccount << " & " << avgSize << " & "
            << addrcounts[quantil] << " & " << maxsize << " & " << ones << "\\\\" << std::endl;
    fout << std::endl;

    const auto hist = genLogHist<int>(addrcounts, logbase, static_cast<int>(maxsize));
    fout << "# format: count\tlower bound included[\tupper bound excluded)" << std::endl;
    for (const auto& bin : hist) {
        fout << bin.first << "\t" << bin.second.first << "\t" << bin.second.second
                << "\t$\\\\lbrack$" << makeNiceNumber(bin.second.first) << "-" << makeNiceNumber(bin.second.second) << ")" << std::endl;
    }
}
/**
 * Entry point: ./histcluster <clusters.out.xxx> <logbase>
 *
 * Writes the statistics and histogram for the given cluster file to
 * "<clusters.out.xxx>.gpd". Returns 1 when arguments are missing or the
 * logbase is not an integer >= 2, and 0 otherwise.
 */
int main(int argc, char* argv[]) {
    // Both the input file and the logbase are required; argv[2] is read below.
    if (argc < 3) {
        std::cout << "Generates a histogram of the cluster sizes for a cluster file created by ./clusterize" << std::endl;
        std::cout << "USAGE: ./histcluster <clusters.out.xxx> <logbase>" << std::endl;
        std::cout << "EXAMPLE: ./histcluster clusters.out.h23.hValue1.hGrowth20.raw 10" << std::endl;
        std::cout << "the output is written to clusters.out.xxx.gpd" << std::endl;
        return 1;
    }

    // atoi yields 0 for non-numeric input; bases below 2 cannot form
    // logarithmic bins, so both cases are rejected here.
    const int logbase = std::atoi(argv[2]);
    if (logbase < 2) {
        std::cerr << "invalid logbase '" << argv[2] << "': must be an integer >= 2" << std::endl;
        return 1;
    }

    const std::string inputName = argv[1];
    readClusters(inputName, inputName + ".gpd", logbase);
    return 0;
}