-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweightings.py
55 lines (37 loc) · 1018 Bytes
/
weightings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
"""
Weightings
==========
Use these functions if you have a raw co-occurrence matrix
stored as a scipy.sparse.csr_matrix
(see subsection 1.4.2).
"""
import numpy as np
def raising(m, coeff=0.75):
    """Raise every stored value of the sparse matrix to the power *coeff*.

    The matrix is modified in place (only ``m.data`` is touched, so the
    sparsity pattern is unchanged) and nothing is returned.

    Args:
        m: Sparse matrix exposing a NumPy ``data`` array of stored values.
        coeff: Exponent applied element-wise to the stored values.
    """
    try:
        # Fast path: keeps the existing buffer when the dtype can hold the result.
        m.data **= coeff
    except TypeError:
        # Integer (or bool) counts cannot receive a floating-point result in
        # place; NumPy refuses the cast before touching the buffer, so it is
        # safe to fall back to an out-of-place power and rebind ``m.data``.
        m.data = np.power(m.data, coeff)
def log(m):
    """Replace every stored value ``x`` of the sparse matrix with ``ln(1 + x)``.

    Only ``m.data`` is rebound; the sparsity pattern is left as is and
    nothing is returned.

    Args:
        m: Sparse matrix exposing a NumPy ``data`` array of stored values.
    """
    shifted = m.data + 1
    m.data = np.log(shifted)
def ppmi(m):
    """Apply positive pointwise mutual information weighting in place.

    Every stored value ``x_ij`` is replaced with::

        max(0, ln(x_ij * N / (r_i * c_j + 1e-5)))

    where ``N`` is the sum of all values, ``r_i`` the sum of row ``i`` and
    ``c_j`` the sum of column ``j``. Only ``m.data`` is modified; nothing is
    returned.

    Args:
        m: Sparse matrix in CSR layout (``data``, ``indices``, ``indptr``).
            Integer-valued matrices are promoted to float64 first.
    """
    # In-place true division is impossible on integer buffers, so promote
    # before anything is mutated (this also avoids integer overflow in the
    # products below).
    if not np.issubdtype(m.data.dtype, np.inexact):
        m.data = m.data.astype(np.float64)

    all_sum = m.sum()
    # ravel() copes with both the 2-D np.matrix and 1-D ndarray results
    # that different sparse containers return from sum(axis=...).
    row_sums = np.asarray(m.sum(axis=1)).ravel()
    col_sums = np.asarray(m.sum(axis=0)).ravel()

    # Expand the marginals to one entry per stored value: the column sum is
    # looked up through ``indices``, the row sum is repeated once for every
    # stored value of that row (row i owns data[indptr[i]:indptr[i + 1]]).
    stored_per_row = np.diff(m.indptr)
    denom = col_sums[m.indices] * np.repeat(row_sums, stored_per_row)

    m.data *= all_sum
    # The small constant guards against division by zero.
    m.data /= 0.00001 + denom
    # Explicitly stored zeros give log(0) = -inf, which the clip maps to 0,
    # so the divide-by-zero warning carries no information here.
    with np.errstate(divide="ignore"):
        m.data = np.log(m.data).clip(min=0.0)
def log_dice(m):
    """Apply log-Dice weighting in place.

    Every stored value ``x_ij`` is replaced with::

        max(0, 14 + log2(2 * x_ij / (r_i + c_j + 1e-5)))

    where ``r_i`` is the sum of row ``i`` and ``c_j`` the sum of column
    ``j``. Only ``m.data`` is modified; nothing is returned.

    Args:
        m: Sparse matrix in CSR layout (``data``, ``indices``, ``indptr``).
            Integer-valued matrices are promoted to float64 first.
    """
    # In-place true division is impossible on integer buffers, so promote
    # before anything is mutated.
    if not np.issubdtype(m.data.dtype, np.inexact):
        m.data = m.data.astype(np.float64)

    # ravel() copes with both the 2-D np.matrix and 1-D ndarray results
    # that different sparse containers return from sum(axis=...).
    row_sums = np.asarray(m.sum(axis=1)).ravel()
    col_sums = np.asarray(m.sum(axis=0)).ravel()

    m.data *= 2
    # Per stored value: column sum (looked up through ``indices``) plus a
    # small constant that guards against division by zero ...
    denom = col_sums[m.indices] + 0.00001
    # ... plus the sum of the row owning that value; row i owns
    # data[indptr[i]:indptr[i + 1]], hence the repeat counts.
    denom += np.repeat(row_sums, np.diff(m.indptr))

    m.data /= denom
    # Explicitly stored zeros give log2(0) = -inf, which the clip maps to 0,
    # so the divide-by-zero warning carries no information here.
    with np.errstate(divide="ignore"):
        m.data = (14 + np.log2(m.data)).clip(min=0.0)