compute_topic_signatures.py
## Script to compute topic signatures with the log-likelihood ratio test (Lin & Hovy, 2000)
import json
from collections import defaultdict
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def compute_likelihood_ratio(c_1, c_12, c_2, p, p_1, p_2, N):
    """Log-likelihood ratio log(lambda) for topic-signature extraction.

    Follows Lin & Hovy (2000): https://www.cs.cmu.edu/~hovy/papers/00linhovy.pdf
    H1 (independence): P(w | doc) = P(w | rest of corpus) = p
    H2 (relevance):    P(w | doc) = p_1  !=  p_2 = P(w | rest of corpus)
    The caller multiplies the result by -2 to obtain the test statistic.
    """
    def log_L(k, n, x):
        # binomial log-likelihood log b(k; n, x), dropping the constant C(n, k)
        return k * np.log(x) + (n - k) * np.log(1 - x)
    return log_L(c_12, c_1, p) \
        + log_L(c_2 - c_12, N - c_1, p) \
        - log_L(c_12, c_1, p_1) \
        - log_L(c_2 - c_12, N - c_1, p_2)
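
# Illustrative sanity check (made-up counts, not part of the original script):
# a word concentrated in one document should score much higher than a word
# spread evenly across the corpus.
#
#   N, c_1 = 10_000, 500                 # corpus size, document size
#   c_2, c_12 = 50, 40                   # word appears mostly inside the document
#   p, p_1 = c_2 / N, c_12 / c_1
#   p_2 = (c_2 - c_12) / (N - c_1)
#   score = -2 * compute_likelihood_ratio(c_1, c_12, c_2, p, p_1, p_2, N=N)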

class TopicSignatureConstruction:
    def __init__(self, data_path):
        self.data_path = data_path
        self.lemma_data = []  # unused in this script
        # word frequency for each document
        self.doc2freq = dict()
        # total number of words in each document
        self.doc_total_words = dict()
        # corpus-wide frequency for each unique token
        self.total_freq = defaultdict(int)
        # total number of token occurrences in the corpus
        self.total_words = 0
        # supply any stop-word list you want, one word per line
        with open('./stopwords.txt') as f:
            self.stopwords = set(ln.strip().lower() for ln in f)
    def load_data(self):
        """
        Load text data and run tokenization.
        Assumes each line in `self.data_path` is a JSON object containing:
            `id`: unique document id
            `text`: untokenized original document
        """
        with open(self.data_path) as fin:
            for ln in fin:
                cur_obj = json.loads(ln)
                cur_id = cur_obj['id']
                cur_text = cur_obj['text']
                # requires the NLTK `punkt` tokenizer models to be downloaded
                cur_words = word_tokenize(cur_text)
                # lowercase, then drop punctuation and stopwords
                lowercased_words = []
                for word in cur_words:
                    word = word.lower()
                    if not word.isalnum():
                        continue
                    if word in self.stopwords:
                        continue
                    lowercased_words.append(word)
                for word in lowercased_words:
                    self.total_freq[word] += 1
                    self.total_words += 1
                    if cur_id not in self.doc2freq:
                        self.doc2freq[cur_id] = defaultdict(int)
                    self.doc2freq[cur_id][word] += 1
                    if cur_id not in self.doc_total_words:
                        self.doc_total_words[cur_id] = 0
                    self.doc_total_words[cur_id] += 1
        print(f'{len(self.doc_total_words)} documents loaded')
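    # Example input line (hypothetical document; format per the docstring above):
    #   {"id": "doc-001", "text": "The quick brown fox jumps over the lazy dog."}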
    def calculate_llr(self):
        """Calculate -2 * log(lambda) for every (document, word) pair."""
        self.doc_word2ratio = {doc_id: defaultdict(float)
                               for doc_id in self.doc2freq}
        N = self.total_words
        for doc_id in tqdm(self.doc2freq):
            for word in self.doc2freq[doc_id]:
                # skip rare words; their statistics are unreliable
                if self.total_freq[word] < 10:
                    continue
                c_2 = self.total_freq[word]         # corpus count of the word
                c_12 = self.doc2freq[doc_id][word]  # count of the word in this doc
                c_1 = self.doc_total_words[doc_id]  # length of this doc
                p = c_2 / N
                p_1 = c_12 / c_1
                p_2 = (c_2 - c_12) / (N - c_1)
                if c_2 == c_12:
                    # word occurs only in this document: p_2 = 0 makes the
                    # log-likelihood undefined, so fall back to 0
                    cur_ratio = 0
                else:
                    cur_ratio = -2 * compute_likelihood_ratio(
                        c_1, c_12, c_2, p, p_1, p_2, N=N)
                self.doc_word2ratio[doc_id][word] = cur_ratio
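    # -2 * log(lambda) is asymptotically chi-square distributed with one degree
    # of freedom, so a significance cutoff can be read from a chi-square table
    # if desired (e.g. 10.83 at the 0.001 level). This script ranks words by
    # the raw score instead of applying a cutoff.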
    def write_to_disk(self):
        """Write each document's words ranked by their ratio, highest first."""
        with open('loglikelihood_ratio.jsonl', 'w') as fout:
            for doc_id, w2ratio in self.doc_word2ratio.items():
                ret_obj = {'id': doc_id, 'ratio_ranked_words': []}
                for word, ratio in sorted(w2ratio.items(),
                                          key=lambda x: x[1], reverse=True):
                    # (word, -2 * log(lambda), count of the word in this doc)
                    output_tuple = (word, ratio, self.doc2freq[doc_id][word])
                    ret_obj['ratio_ranked_words'].append(output_tuple)
                fout.write(json.dumps(ret_obj) + '\n')
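    # Each output line looks like this (values illustrative; json.dumps
    # serializes the tuples as lists):
    #   {"id": "doc-001", "ratio_ranked_words": [["fox", 23.4, 3], ["dog", 11.2, 2]]}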

if __name__ == '__main__':
DATA_PATH = 'demo.jsonl'
ts_construction = TopicSignatureConstruction(data_path=DATA_PATH)
ts_construction.load_data()
ts_construction.calculate_llr()
ts_construction.write_to_disk()
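    # Setup note: `word_tokenize` needs the NLTK tokenizer models, obtainable
    # with e.g. `python -c "import nltk; nltk.download('punkt')"`, and both
    # `demo.jsonl` and `stopwords.txt` must exist in the working directory.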