-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprocess_memeorandum_pages.py
executable file
·35 lines (32 loc) · 1.21 KB
/
process_memeorandum_pages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import nltk
from readability.readability import Document
import os
import sys
def clean_html_directory(source_directory, target_directory):
    """Retain only the relevant article text from each file in the
    source directory and save it as a text-only file in the target
    directory.

    Every entry returned by ``os.listdir(source_directory)`` is read,
    passed through ``Document(...).summary()`` and ``nltk.clean_html``,
    and written UTF-8 encoded to ``target_directory`` under the source
    file name with its final extension removed.

    Arguments:
    - `source_directory`: directory with .html files
    - `target_directory`: directory to save files in (created when it
      does not exist yet)
    """
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    html_files = sorted(os.listdir(source_directory))
    # Single-argument parenthesised form prints the same text under
    # Python 2 and also parses under Python 3.
    print("Processing {0} files".format(len(html_files)))
    for f in html_files:
        # os.path.join works whether or not the caller supplied a trailing
        # separator; plain concatenation silently built wrong paths.
        # The with-block guarantees the input handle is closed.
        with open(os.path.join(source_directory, f)) as html:
            html_text = html.read()
        relevant = Document(html_text).summary()
        cleaned = nltk.clean_html(relevant)
        # Carriage returns survive the cleaning step. The replacement that
        # was here swapped a space for a space (a no-op), so strip the raw
        # character and - NOTE(review): presumably the form it takes after
        # HTML serialisation, confirm - its numeric entity as well.
        cleaned = cleaned.replace('&#13;', ' ').replace('\r', ' ')
        # Drop only the final extension: splitting on the first '.' turned
        # "a.b.html" into "a" (overwriting on collisions) and produced an
        # empty name for dotfiles.
        save_f = os.path.splitext(f)[0]
        with open(os.path.join(target_directory, save_f), 'w') as output:
            output.write(cleaned.encode('utf-8', 'ignore'))
if __name__ == '__main__':
    # Usage: process_memeorandum_pages.py SOURCE_DIR TARGET_DIR
    if len(sys.argv) < 3:
        # sys.exit with a string writes it to stderr and exits with
        # status 1, so shells and calling scripts can detect the usage
        # error instead of seeing a successful (0) exit.
        sys.exit("Error: Expected source and target directory")
    clean_html_directory(sys.argv[1], sys.argv[2])