-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdistance.c
135 lines (129 loc) · 4.05 KB
/
distance.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
// Program-wide limits. They are enumeration constants (true integer constant
// expressions) so that arrays sized by them are ordinary fixed-size arrays
// rather than variable-length arrays.
enum {
  max_size = 2000, // max length of strings
  N = 40,          // number of closest words that will be shown
  max_w = 50       // max length of vocabulary entries
};
// Nearest-neighbour lookup over a binary word-vector file.
//
// run by ./distance <FILE> <OUTFILE> <words>
//   e.g. ./distance intm/vector_qnum.bin intm/nw_qnum.txt word
//
// FILE    starts with the text header "<words> <size>"; every entry after it
//         is a word, one space, then <size> raw floats.
// OUTFILE the top N words are appended to it, one "<word>\t\t<score>" line
//         each, where score is the dot product of the normalised vectors.
// words   one argument holding one or more space-separated vocabulary words;
//         their normalised vectors are summed to form the query vector.
//
// Exit status: 0 on success, on a usage error and when a query word is not in
// the vocabulary; -1 when the input file or the query cannot be used; 1 when
// the output file cannot be written.

enum { MAX_QUERY_WORDS = 100 }; // most words accepted in a single query

// Reads one vocabulary word from f into dst, which holds cap bytes. Reading
// stops at a space or at end of file; newlines are skipped and characters that
// do not fit are dropped, so dst is always NUL-terminated inside its own slot.
static void read_vocab_word(FILE *f, char *dst, long long cap) {
  long long len = 0;
  for (;;) {
    int ch = fgetc(f);
    if (ch == EOF || ch == ' ') {
      break;
    }
    if (ch == '\n') {
      continue;
    }
    if (len < cap - 1) {
      dst[len++] = (char)ch;
    }
  }
  dst[len] = '\0';
}

// Scales the n-element vector v to unit Euclidean length. A zero vector is
// left unchanged instead of being divided by zero.
static void normalize(float *v, long long n) {
  float len = 0;
  long long i;
  for (i = 0; i < n; i++) {
    len += v[i] * v[i];
  }
  len = sqrt(len);
  if (len == 0) {
    return;
  }
  for (i = 0; i < n; i++) {
    v[i] /= len;
  }
}

// Returns the index of word in vocab (words entries, stride bytes apart), or
// -1 when it is not present.
static long long find_word(const char *vocab, long long words, long long stride, const char *word) {
  long long i;
  for (i = 0; i < words; i++) {
    if (strcmp(&vocab[i * stride], word) == 0) {
      return i;
    }
  }
  return -1;
}

// Splits buf in place at spaces and stores the start of every non-empty token
// in tokens. Returns the number of tokens, or -1 if there are more than cap.
static long long split_words(char *buf, char **tokens, long long cap) {
  long long count = 0;
  char *p = buf;
  for (;;) {
    while (*p == ' ') {
      p++;
    }
    if (*p == '\0') {
      break;
    }
    if (count == cap) {
      return -1;
    }
    tokens[count++] = p;
    while (*p != '\0' && *p != ' ') {
      p++;
    }
    if (*p == '\0') {
      break;
    }
    *p++ = '\0';
  }
  return count;
}

int main(int argc, char **argv) {
  // Declared ahead of every goto: depending on how N is defined these can be
  // variable-length arrays, and a jump may not cross such a declaration.
  float best_dist[N];
  long long best_idx[N];
  char *query[MAX_QUERY_WORDS];
  long long query_idx[MAX_QUERY_WORDS];
  const size_t limit = (size_t)-1;
  FILE *in = NULL;
  FILE *out = NULL;
  char *vocab = NULL;
  float *M = NULL;
  float *vec = NULL;
  long long words = 0, size = 0, nquery, i, j, k;
  int rc = -1;

  if (argc < 4) {
    printf("Usage: ./distance <FILE> <OUTFILE> <word>\nwhere FILE contains word projections in the BINARY FORMAT\n");
    return 0;
  }
  in = fopen(argv[1], "rb");
  if (in == NULL) {
    printf("Input file not found\n");
    return -1;
  }
  if (fscanf(in, "%lld", &words) != 1 || fscanf(in, "%lld", &size) != 1 || words <= 0 || size <= 0) {
    printf("Invalid header in input file\n");
    goto done;
  }
  // Reject dimensions whose byte counts would wrap around in size_t.
  if ((unsigned long long)words > limit / (size_t)max_w ||
      (unsigned long long)size > limit / sizeof(float) ||
      (unsigned long long)words > limit / sizeof(float) / (size_t)size) {
    printf("Input file dimensions are too large: %lld %lld\n", words, size);
    goto done;
  }
  vocab = malloc((size_t)words * (size_t)max_w);
  M = malloc((size_t)words * (size_t)size * sizeof *M);
  vec = calloc((size_t)size, sizeof *vec); // zeroed: it accumulates the query
  if (vocab == NULL || M == NULL || vec == NULL) {
    printf("Cannot allocate memory: %lld MB %lld %lld\n",
           (long long)((size_t)words * (size_t)size * sizeof *M / 1048576), words, size);
    goto done;
  }

  // Load every entry and normalise its vector so a dot product is a cosine.
  for (i = 0; i < words; i++) {
    float *row = &M[i * size];
    read_vocab_word(in, &vocab[i * max_w], max_w);
    if (fread(row, sizeof *row, (size_t)size, in) != (size_t)size) {
      printf("Input file is truncated\n");
      goto done;
    }
    normalize(row, size);
  }
  fclose(in);
  in = NULL;

  // The argv strings are writable, so the query is tokenised where it is.
  nquery = split_words(argv[3], query, MAX_QUERY_WORDS);
  if (nquery < 0) {
    printf("Too many query words\n");
    goto done;
  }
  if (nquery == 0) {
    printf("No query word given\n");
    goto done;
  }
  for (i = 0; i < nquery; i++) {
    query_idx[i] = find_word(vocab, words, max_w, query[i]);
    printf("\nWord: %s Position in vocabulary: %lld\n", query[i], query_idx[i]);
    if (query_idx[i] == -1) {
      printf("Out of dictionary word!\n");
      rc = 0;
      goto done;
    }
  }

  printf("\n Word Cosine distance\n------------------------------------------------------------------------\n");
  for (i = 0; i < nquery; i++) {
    const float *row = &M[query_idx[i] * size];
    for (j = 0; j < size; j++) {
      vec[j] += row[j];
    }
  }
  normalize(vec, size);

  // best_dist is kept in descending order; index -1 marks an unused slot.
  for (k = 0; k < N; k++) {
    best_dist[k] = -1;
    best_idx[k] = -1;
  }
  for (i = 0; i < words; i++) {
    const float *row = &M[i * size];
    float dist = 0;
    int is_query = 0;
    for (j = 0; j < nquery; j++) {
      if (query_idx[j] == i) {
        is_query = 1;
      }
    }
    if (is_query) {
      continue; // the query words themselves are never reported
    }
    for (j = 0; j < size; j++) {
      dist += vec[j] * row[j];
    }
    for (k = 0; k < N; k++) {
      if (dist > best_dist[k]) {
        for (j = N - 1; j > k; j--) {
          best_dist[j] = best_dist[j - 1];
          best_idx[j] = best_idx[j - 1];
        }
        best_dist[k] = dist;
        best_idx[k] = i;
        break;
      }
    }
  }

  out = fopen(argv[2], "a");
  if (out == NULL) {
    perror("Error opening file");
    rc = 1;
    goto done;
  }
  for (k = 0; k < N; k++) {
    fprintf(out, "%50s\t\t%f\n", best_idx[k] >= 0 ? &vocab[best_idx[k] * max_w] : "", best_dist[k]);
  }
  if (fclose(out) != 0) {
    perror("Error writing file");
    rc = 1;
    goto done;
  }
  rc = 0;

done:
  if (in != NULL) {
    fclose(in);
  }
  free(vec);
  free(M);
  free(vocab);
  return rc;
}