word2vec.py
import os
from multiprocessing import Pool
import gensim.downloader as gensim_api
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
LABELS = "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"

def load(path, max_lines=10000):
    """
    Load the comment texts and the label columns from a CSV dataset.

    :param path: path to the CSV file
    :param max_lines: keep only the last max_lines rows
    :return: comment texts, list of label columns (one per entry in LABELS)
    """
    df = pd.read_csv(path)[-max_lines:]  # the full training set has 159571 lines
    x = df["comment_text"]
    y = [df[label] for label in LABELS]
    return x, y


def tokenize(comment: str):
    # split into lowercase tokens
    tokens = word_tokenize(comment.lower())
    # keep only alphabetic tokens that are neither stopwords nor missing from the embedding vocabulary
    tokens = [t for t in tokens if t.isalpha() and t not in english_stopwords and t in corpus]
    return tokens

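# Illustrative sketch (not part of the original script): with NLTK's English
# stopword list, tokenize("This movie is GREAT!!!") would return something like
# ["movie", "great"], assuming both words are present in the GloVe vocabulary.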

def vectorize(words: list):
    """
    Return the arithmetic mean of the embedding vectors of all given words.
    """
    return np.mean(corpus[words], axis=0)

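# Sketch of the idea: for a 200-dimensional model such as glove-twitter-200,
# vectorize(["good", "movie"]) yields a single vector of shape (200,), the
# element-wise mean of corpus["good"] and corpus["movie"].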

def process_chunk(comments: list, starting_index=0):
    """
    Tokenize and vectorize the given comments (see preprocess).

    Comments that end up with no usable tokens are dropped; their global
    indices (position in the chunk plus starting_index) are returned separately.
    """
    tokenized = [tokenize(comment) for comment in comments]
    removed_indices = [i + starting_index for i in range(len(tokenized)) if not tokenized[i]]
    tokenized = filter(None, tokenized)  # skip empty token lists
    vectorized = [vectorize(words) for words in tokenized]
    return vectorized, removed_indices

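# Example of the index bookkeeping (hypothetical values): if a chunk starts at
# global index 100 and its third comment tokenizes to an empty list, the chunk
# contributes 102 to removed_indices and returns one vector fewer than it
# received comments.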

def preprocess(comments: list, workers: int = None):
    """
    Tokenize and vectorize the given comments in parallel.

    :param comments: list of comment strings
    :param workers: number of parallel processes (default: number of CPUs)
    :return: vectors, indices of non-classifiable comments
    """
    if workers is None:
        workers = os.cpu_count()
    chunk_size = len(comments) / workers
    chunk_starts = [int(chunk_size * i) for i in range(workers)]
    chunk_ends = chunk_starts[1:] + [len(comments)]
    with Pool(workers) as p:  # one worker process per chunk
        results = []
        for start, end in zip(chunk_starts, chunk_ends):
            results.append(p.apply_async(
                func=process_chunk,
                args=(comments[start:end], start),
            ))
        vectorized, removed_indices = [], []
        for chunk in results:
            vectors, indices = chunk.get()
            vectorized.extend(vectors)
            removed_indices.extend(indices)
    return vectorized, removed_indices

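# Minimal usage sketch (assumed inputs, not from the original script):
#   vectors, removed = preprocess(["I love this!", "!!!"], workers=2)
# would yield one mean vector for the first comment and removed == [1], since
# the second comment has no usable tokens after filtering.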
print("Loading libraries...")
nltk.download("punkt")
nltk.download("stopwords")
if not os.path.exists(".gensim-cache"):
corpus = gensim_api.load("glove-twitter-200")
corpus.save(".gensim-cache")
else:
corpus = KeyedVectors.load(".gensim-cache", mmap="r")
english_stopwords = set(stopwords.words("english"))
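# glove-twitter-200 provides 200-dimensional vectors, e.g. corpus["hello"] is a
# numpy array of shape (200,); looking up a word that is missing from the
# vocabulary raises a KeyError, which is why tokenize() filters on membership.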

if __name__ == "__main__":
    x_train, y_train = load("train_pretty.csv", max_lines=159_571)
    x_test, y_test = load("test_pretty.csv", max_lines=159_571)
    x_train_tokenized, train_removed = preprocess(x_train)
    x_test_tokenized, test_removed = preprocess(x_test)
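
    # Hedged sketch (assumption, not part of the original script): a typical next
    # step is to stack the mean vectors into a feature matrix and drop the label
    # rows whose comments produced no usable tokens, keeping x and y aligned.
    # The names x_train_matrix and y_train_aligned are hypothetical.
    x_train_matrix = np.vstack(x_train_tokenized)
    y_train_aligned = [np.delete(col.to_numpy(), train_removed) for col in y_train]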