-
Notifications
You must be signed in to change notification settings - Fork 3
/
Corp.py
69 lines (61 loc) · 2.82 KB
/
Corp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Corp.py
# Memory-friendly ways to deal with json files
from HTMLParser import HTMLParser
from gensim import corpora
import string
import json
# Stop words: extremely common English words that carry high frequency but
# little topical signal; they are removed from posts before topic modelling
# because they do not help distinguish between topics.
stop_words = ["a","able","about","across","after","all","almost","also","am","among","an","and","any","are","as","at","be","because","been","but","by","can","cannot","could","did","do","does","either","else","ever","every","for","from","get","got","had","has","have","he","her","hers","him","his","how","however","i","if","in","into","is","it","its","just","least","let","like","may","me","might","most","must","my","neither","no","nor","not","of","off","often","on","only","or","other","our","own","rather","said","say","says","she","should","since","so","some","than","that","the","their","them","then","there","these","they","this","to","too","us","wants","was","we","were","what","when","where","which","while","who","whom","why","will","with","would","yet","you","your"]
# Tools for stripping html
class MLStripper(HTMLParser):
    """HTML parser that collects only the text content of a document,
    discarding every tag.  Feed it markup with ``feed()`` and read the
    accumulated text back with ``get_data()``."""
    def __init__(self):
        # Run the base-class constructor first: the original code only
        # called self.reset(), which leaves part of HTMLParser's state
        # uninitialized (e.g. ``convert_charrefs`` on html.parser, making
        # feed() raise).  Calling __init__ is safe on both implementations.
        HTMLParser.__init__(self)
        self.reset()
        self.fed = []  # text fragments gathered by handle_data()
    def handle_data(self, d):
        # Invoked by feed() for every run of text between tags.
        self.fed.append(d)
    def get_data(self):
        # Return all collected text joined into one string.
        return ''.join(self.fed)
def strip_tags(html):
    """Return *html* with all markup removed, keeping only the text."""
    stripper = MLStripper()
    stripper.feed(html)
    return stripper.get_data()
# An object to read and parse files without loading them entirely into memory
class Files():
    """Read newline-delimited JSON post files without loading any file
    entirely into memory.

    Iterating yields, for each line (one JSON object per line, with a
    ``"content"`` field), the post's content as a list of lower-case
    words with HTML and punctuation stripped.  Also usable as a context
    manager so the underlying file objects are closed on exit.
    """
    def __init__(self, files):
        self.files = files  # open file objects, one JSON post per line
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
    def __iter__(self):
        # Read only one line at a time from the text files, to be memory
        # friendly.
        for f in self.files:
            f.seek(0)  # reset the file pointer before a new iteration
            for line in f:
                post = json.loads(line)
                content = post["content"]
                doc_words = []
                try:
                    # Strip HTML, drop non-ASCII characters, remove
                    # punctuation, then split into lower-case words.
                    doc_words = strip_tags(content).encode('ascii', 'ignore').translate(string.maketrans("", ""), string.punctuation).lower().split()
                except Exception:
                    # Tokenizing fails on some malformed unicode; fall back
                    # to an empty document.  The original bare ``except:``
                    # also swallowed KeyboardInterrupt/SystemExit, so catch
                    # only genuine errors here.
                    doc_words = []
                yield doc_words
    def __len__(self):
        # Total number of posts, i.e. lines, across all files.
        n = 0
        for f in self.files:
            f.seek(0)
            for line in f:
                n += 1
        return n
    def close(self):
        for f in self.files:
            f.close()
# A helper class, for use in gensim's LDA implementation
class Corp():
    """Streaming corpus adapter for gensim's LDA: lazily converts each
    tokenized document from the underlying source into a bag-of-words
    vector via the supplied dictionary."""
    def __init__(self, files, dic):
        self.files = files  # iterable of token lists (e.g. a Files instance)
        self.dic = dic      # gensim Dictionary providing doc2bow()
    def __iter__(self):
        # One bag-of-words vector per document, produced on demand.
        return (self.dic.doc2bow(tokens) for tokens in self.files)
    def __len__(self):
        # Exactly as many documents as the underlying source reports.
        return len(self.files)