artifacts.py
import json
from pprint import pprint
import random
from os import listdir
from os.path import isfile, join
import io
import logging
import re
import string
import spacy
nlp = spacy.load('en')  # spaCy 2.x shortcut link; for spaCy 3+ use spacy.load('en_core_web_sm')
# Reading and processing data
from collections import Counter
import sys
categories = Counter()
claims = {"SUPPORTS": [], "REFUTES": [], "NOT ENOUGH INFO": []}
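# Each line of train.jsonl is assumed to be a JSON object with at least a
# "claim" string and a "label" in {SUPPORTS, REFUTES, NOT ENOUGH INFO}
# (the only fields read below).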
with open('data/train.jsonl') as f:
    for i, line in enumerate(f):
        if i % 10000 == 0:
            sys.stdout.write('.')
        content = json.loads(line)
        claim = content['claim']
        label = content['label']
        categories[label] += 1
        claims[label].append(claim)
claim_docs = dict()
for k in claims.keys():
    # n_threads is a spaCy 2.0-era option; it is deprecated/removed in later versions
    processed = list(nlp.pipe(claims[k], n_threads=40, batch_size=20000))
    claim_docs[k] = processed
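# claim_docs[label] now holds a list of spaCy Doc objects for that label;
# token lengths and lemmas are read from these Docs below.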
claim_vocab = {"SUPPORTS": Counter(), "REFUTES": Counter(), "NOT ENOUGH INFO": Counter()}
claim_len = {"SUPPORTS": Counter(), "REFUTES": Counter(), "NOT ENOUGH INFO": Counter()}
# vocab count of all claims regardless of class
total_vocab = Counter()
total_word_count = 0
for cat in claim_docs.keys():
    for doc in claim_docs[cat]:
        claim_len[cat][len(doc)] += 1
        total_word_count += len(doc)
        for token in doc:
            claim_vocab[cat][token.lemma_] += 1
            total_vocab[token.lemma_] += 1
# sanity check: both ways of counting tokens agree (1,377,880 tokens in this training set)
assert total_word_count == len(list(total_vocab.elements())) == 1377880
# Claim sentence length (by tokens)
# mean sent len
%matplotlib inline
import matplotlib.pyplot as plt
import numpy
plot_args = []
colors = ['r', 'b', 'g']
stats = []
for i, k in enumerate(claim_len.keys()):
    count = claim_len[k]
    elem = list(count.elements())
    stats.append([k, numpy.mean(elem), numpy.std(elem), min(elem), max(elem)])
    #print(" | ".join(list(map(str, [k, numpy.mean(elem), numpy.std(elem), min(elem), max(elem)]))))
    pmf, bins = numpy.histogram(elem, bins=range(0, 40), density=True)
    plt.plot(bins[:-1], pmf, colors[i], label=k)
plt.xlabel("sentence length (# tokens)")
plt.ylabel("probability of sentence length")
plt.legend(loc='upper right')
plt.show()
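# Optional sketch: print the per-class sentence-length statistics collected
# in `stats` above as a small table.
print("class | mean | std | min | max")
for name, mean, std, lo, hi in stats:
    print("%s | %.2f | %.2f | %d | %d" % (name, mean, std, lo, hi))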
# PMI between each word and class in the training set
# calculate PMI
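# PMI is estimated from the counters built above:
#   P(word, class)   = count(word in class) / total_word_count
#   P(word)          = count(word)          / total_word_count
#   P(class)         = # claims with that label / total # claims
#   PMI(word, class) = log( P(word, class) / (P(word) * P(class)) )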
def normalize_counter(oldcount, total):
    count = dict()
    for key in oldcount.keys():
        count[key] = oldcount[key] / total
    return count
# a dictionary mapping a class to probability of a word in that class
norm_vocab = dict()
# probability of a class (pclass is used below; estimated from the label counts)
pclass = normalize_counter(categories, sum(categories.values()))
for k in ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]:
    norm_vocab[k] = normalize_counter(claim_vocab[k], total_word_count)
# probability of a word across all classes
norm_total_vocab = normalize_counter(total_vocab, total_word_count)
# should all sum to 1
sum(norm_total_vocab.values())
sum(pclass.values())
# sum([sum(v.values()) for v in norm_vocab.values()])
import math
pmi = dict()
for k in ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]:
    pmi[k] = dict()
    for word in claim_vocab[k].keys():
        pword_class = claim_vocab[k][word] / total_word_count
        pword = total_vocab[word] / total_word_count
        pmi[k][word] = math.log(pword_class / (pword * pclass[k]))
print("Words with most PMI in each class")
from pprint import pprint
top_pmi = []
import operator
import csv
with open("pmi.tsv", "w") as f:
writer = csv.writer(f, delimiter="\t")
writer.writerow(["Class", "Word", "P(class)", "P(word)", "P(word, class)", "pmi", "Count", "Class count"])
for k in ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]:
sort = sorted(pmi[k].items(), key=operator.itemgetter(1), reverse=True)
tops_class = list(map(list, sort[:10000]))
top_pmi.append(tops_class)
for word in tops_class:
word = word[0]
if (k == "NOT ENOUGH INFO"):
key = "NEI"
else:
key = k
writer.writerow([key, word, round(pclass[k],4), round(norm_total_vocab[word], 10),
round(norm_vocab[k][word],10),
round(pmi[k][word], 2),
round(total_vocab[word], 2),
round(claim_vocab[k][word], 2)] )
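# Optional sketch: peek at the ten highest-PMI words per class (top_pmi holds
# [word, pmi] pairs in the same class order as the loop above).
for k, tops in zip(["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"], top_pmi):
    print(k, [w for w, _ in tops[:10]])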
# distinct PMI values in REFUTES that exceed the PMI of the word "only"
set([value for (word, value) in pmi["REFUTES"].items() if value > pmi["REFUTES"]["only"]])
# PMI of negation words
for k in ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]:
    for word in ["not", "never", "nor", "none", "null"]:
        try:
            print(k, word, pmi[k][word])
        except KeyError:
            print(k, word, "never occurred")
    print("")
# PMI of temporal words
for k in ["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"]:
    for word in ["often", "always", "rarely", "frequently", "regularly", "sometimes", "usually"]:
        try:
            print(k, word, pmi[k][word])
        except KeyError:
            print(k, word, "never occurred")
    print("")