# extract_word_stats.py
import os
import glob
import math
import random
from collections import Counter, defaultdict

from scipy import stats

# utils supplies the Utils helper class plus the filtering thresholds used
# below (MIN_FREQUENCY_RATIO, MIN_WORD_LEN, MAX_SPEARMANS_CORRELATION)
from utils import *

class WordStatsExtractor:

    def __init__(self, coha_path, coca_path):
        self.data_path_historical = coha_path  # path to COHA text directory (containing decade subdirs)
        self.data_path_modern = coca_path  # path to COCA text directory (containing genre subdirs)
        self.frequency_dict_historical = {}  # {"total": {word: frequency}, "1810s": {word: frequency}, ...}
        self.frequency_dict_modern = {}  # {"total": {word: frequency}}
        self.capitalization_counter_dict = defaultdict(Counter)  # {word: {form: count}}
    def extract_frequencies(self, vocabulary, data_split):
        """
        Collect word frequencies for the specified split ('historical' or 'modern')
        :param vocabulary: vocabulary of nouns for analysis
        :param data_split: 'historical' or 'modern' (corresponding to COHA and COCA respectively)
        :return: None (results are stored on the extractor)
        """
        assert data_split in ("historical", "modern")
        num_tokens_total = 0
        counts_total = Counter()
        if data_split == "historical":
            data_dir = self.data_path_historical
            # Historical data is the COHA corpus up to 1989
            subdirs = ["1810s", "1820s", "1830s", "1840s", "1850s",
                       "1860s", "1870s", "1880s", "1890s", "1900s",
                       "1910s", "1920s", "1930s", "1940s", "1950s",
                       "1960s", "1970s", "1980s"]
        else:
            data_dir = self.data_path_modern
            # Modern data is the entire COCA corpus
            subdirs = ["text_academic_rpe", "text_fiction_awq",
                       "text_magazine_qch", "text_newspaper_lsp",
                       "text_spoken_kde"]
        for subdir in subdirs:
            current_dir = f"{data_dir}/{subdir}/"
            if not os.path.exists(current_dir):
                print(f"Missing subdirectory: {subdir}")
                continue
            print(f"Processing {current_dir}")
            num_tokens_subdir = 0
            counts_in_subdir = Counter()
            for filename in glob.glob(current_dir + "*.txt"):
                with open(filename, 'r') as fin:
                    for line in fin:
                        tokens = line.strip().split()
                        num_tokens_subdir += len(tokens)
                        for token in tokens:
                            word = token.lower()
                            if word in vocabulary:
                                counts_in_subdir[word] += 1
                                # For each word, we also count its occurrences in different capitalization forms
                                if data_split == "modern":
                                    self.capitalization_counter_dict[word][token] += 1
            # Per-decade frequencies are only needed for the historical split
            if data_split == "historical":
                self.frequency_dict_historical[subdir] = Utils.normalize(counts_in_subdir, num_tokens_subdir)
            num_tokens_total += num_tokens_subdir
            counts_total += counts_in_subdir
        if data_split == "historical":
            self.frequency_dict_historical["total"] = Utils.normalize(counts_total, num_tokens_total)
        else:
            self.frequency_dict_modern["total"] = Utils.normalize(counts_total, num_tokens_total)
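    # Usage sketch (illustrative only; the paths and the toy vocabulary are
    # placeholders, not values from this repository):
    #     extractor = WordStatsExtractor("data/coha", "data/coca")
    #     nouns = {"email", "website", "carriage"}
    #     extractor.extract_frequencies(nouns, "historical")
    #     extractor.extract_frequencies(nouns, "modern")
    # Afterwards, frequency_dict_historical holds one normalized {word: frequency}
    # dict per decade plus a "total" entry, and frequency_dict_modern holds "total".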
    def extract_neologisms(self, outfile):
        """
        Extract a list of neologisms, filtering by modern-historical frequency ratio, word length and capitalization
        :param outfile: file path to output neologisms
        :return: list of extracted neologisms
        """
        vocabulary = self.frequency_dict_modern["total"].keys()
        neologism_counter = Counter()
        for word in vocabulary:
            # To be included as a neologism, the word has to occur most frequently in lowercase
            most_common_form = self.capitalization_counter_dict[word].most_common(1)[0][0]
            if most_common_form != word:
                continue
            freq_mod = self.frequency_dict_modern["total"][word]
            # Words absent from the historical data get frequency 0
            freq_hist = self.frequency_dict_historical["total"].get(word, 0)
            # First, we select words that are MIN_FREQUENCY_RATIO times more frequent in modern than historical data
            if freq_mod > 0 and (freq_hist == 0 or freq_mod / freq_hist > MIN_FREQUENCY_RATIO):
                neologism_counter[word] = freq_mod
        # We filter out words shorter than MIN_WORD_LEN characters, and keep only the 1000 most frequent
        neologism_list = [n[0] for n in neologism_counter.most_common() if len(n[0]) >= MIN_WORD_LEN][:1000]
        with open(outfile, 'w') as fout:
            for word in neologism_list:
                fout.write(f"{word}\n")
        return neologism_list
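    # Ratio filter with toy numbers (the real MIN_FREQUENCY_RATIO lives in utils;
    # 10 below is only for illustration): with freq_mod = 5e-6, a historical
    # frequency of 1e-7 passes (ratio 50 > 10) while 1e-6 is rejected (ratio 5).
    # Words never attested historically (freq_hist == 0) always pass this test.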
    def extract_frequency_growth(self, vocabulary, outfile):
        """
        For all words in the vocabulary, compute their frequency growth rate (Spearman correlation
        between the decade index and the word's per-decade frequency in the historical data)
        :param vocabulary: vocabulary of nouns for analysis
        :param outfile: file path to output words and their frequency growth rates
        :return: word - frequency growth rate dictionary
        """
        # Time steps enumerate the 18 decades from the 1810s to the 1980s
        time_steps = list(range(1, 19))
        frequency_growth_dict = {}
        with open(outfile, 'w') as fout:
            for word in vocabulary:
                word_frequency_series = []
                # Sorted decade keys ("1810s" ... "1980s") precede the "total" key alphabetically
                for key in sorted(self.frequency_dict_historical.keys())[:18]:
                    frequency = self.frequency_dict_historical[key].get(word, 0)
                    word_frequency_series.append(frequency)
                if sum(word_frequency_series) > 0:
                    corr, pval = stats.spearmanr(time_steps, word_frequency_series)
                    fout.write(f"{word}\t{corr}\t{pval}\n")
                    frequency_growth_dict[word] = corr
        return frequency_growth_dict
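    # Intuition check (standalone sketch, not part of the pipeline): a strictly
    # rising frequency series yields correlation +1.0, a declining one is
    # negative, and a flat or noisy one sits near 0. For example:
    #     stats.spearmanr([1, 2, 3, 4], [0.1, 0.3, 0.4, 0.9]).correlation  # 1.0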
    def pair_neologisms_with_controls(self, frequency_growth_dict, neologism_list, outfile,
                                      stability_constraint=True, seed=None):
        """
        Collect a set of control words by pairing each neologism with a control counterpart,
        controlling for overall frequency and word length
        :param frequency_growth_dict: word - frequency growth rate dictionary
        :param neologism_list: list of neologisms to pair
        :param outfile: file path to output neologism - control word pairs
        :param stability_constraint: toggles between 'stable' and 'relaxed' control sets (default = True)
        :param seed: seed for shuffling candidates in the relaxed control sets (default = None)
        :return: neologism - control word pair dictionary
        """
        pairs_dict = {}
        candidate_controls = []
        frequencies_historical = self.frequency_dict_historical['total']
        frequencies_modern = self.frequency_dict_modern['total']
        neologism_set = set(neologism_list)  # set membership is O(1) vs. O(n) for the list
        for word, frequency_growth in frequency_growth_dict.items():
            if word not in neologism_set:
                if not stability_constraint:
                    candidate_controls.append(word)
                # 'Stable' controls must show no clear frequency trend across the decades
                elif math.fabs(float(frequency_growth)) < MAX_SPEARMANS_CORRELATION:
                    candidate_controls.append(word)
        print(f"Found {len(candidate_controls)} candidate control words")
        if seed is not None:
            random.seed(seed)
            random.shuffle(candidate_controls)
        for neologism in neologism_list:
            neologism_len = len(neologism)
            neologism_freq = frequencies_modern.get(neologism, 0)
            matched_flag = False
            for control in candidate_controls:
                control_len = len(control)
                control_freq = frequencies_historical.get(control, 0)
                if control_freq == 0:
                    continue
                freq_ratio = neologism_freq / control_freq
                # To replicate our results, one needs to restrict the frequency ratio to fall within (0.75, 1.33),
                # rather than (0.75, 1.25) as reported in the paper
                if abs(neologism_len - control_len) < 2 and 0.75 < freq_ratio < 1.33:
                    pairs_dict[neologism] = control
                    candidate_controls.remove(control)  # each control word is used at most once
                    matched_flag = True
                    break
            if not matched_flag:
                print(f"Failed to pair with a control: {neologism}")
        print(f"Created {len(pairs_dict)} neologism-control pairs")
        with open(outfile, 'w') as fout:
            for neologism, control in pairs_dict.items():
                fout.write(f"{neologism}\t{control}\n")
        return pairs_dict
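
if __name__ == "__main__":
    # End-to-end sketch of the intended pipeline. The corpus paths, the
    # vocabulary file, its one-noun-per-line format, and the output file names
    # are placeholders assumed for illustration; adjust them to your setup.
    extractor = WordStatsExtractor("data/coha", "data/coca")
    with open("nouns.txt") as fin:
        vocabulary = {line.strip() for line in fin}
    extractor.extract_frequencies(vocabulary, "historical")
    extractor.extract_frequencies(vocabulary, "modern")
    neologisms = extractor.extract_neologisms("neologisms.txt")
    growth_rates = extractor.extract_frequency_growth(vocabulary, "growth_rates.txt")
    extractor.pair_neologisms_with_controls(growth_rates, neologisms,
                                            "neologism_control_pairs.txt", seed=42)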