-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_docs.py
68 lines (49 loc) · 2.57 KB
/
parse_docs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re
from process_text import Process_text
class Parse_cacm:
    """Parser for a CACM-style collection file.

    The file is a flat sequence of lines. A document starts with a line of
    the form ``.I <id>`` and is followed by sections, each introduced by a
    two-character marker (``.T``, ``.W``, ``.B``, ``.A``, ``.N``, ``.X``,
    ``.K``, ``.C``). Only the text found under ``.T``, ``.W`` and ``.K`` is
    collected; it is concatenated and handed to
    ``Process_text.sanitize_rawtext_with_stats``.

    The parser keeps a cursor (``current_line_number``) into ``lines`` that
    every method advances; the methods are meant to be driven by
    ``parse_file``.
    """

    def __init__(self, cacm_words_path, common_words_filename):
        """Load the collection file and prepare the text processor.

        cacm_words_path: path of the collection file to parse.
        common_words_filename: forwarded unchanged to ``Process_text``.
        """
        with open(cacm_words_path, 'r') as f:
            self.lines = f.readlines()
        # Index of the line the parser is currently looking at.
        self.current_line_number = 0
        # Raw string so that "\." / "\s" / "\d" reach the regex engine
        # untouched; "\d+" so that a header without an id does not match
        # (int('') would otherwise raise ValueError in parse_file).
        self.document_begin_regex = re.compile(r'^\.I\s(?P<id>\d+)')
        self.category_markers = ['.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C']
        # Filled by process_document with (document_id, processed_text) tuples.
        self.documents = []
        self.text_processor = Process_text(common_words_filename)

    def parse_file(self):
        """Parse every document from the cursor to the end of the file.

        Returns ``self.documents``, a list of ``(document_id, stats)`` tuples
        where ``stats`` is whatever ``sanitize_rawtext_with_stats`` returns
        (per the original author's note, a ``{word: tf}`` mapping).
        Lines that are not a valid ``.I <id>`` header are skipped.
        """
        while self.current_line_number < len(self.lines):
            header = self.document_begin_regex.match(
                self.lines[self.current_line_number])
            if header:
                # process_document leaves the cursor on the next ".I" line
                # (or past the end), so the cursor is not advanced here.
                self.process_document(int(header.group('id')))
            else:
                self.current_line_number += 1
        return self.documents

    def process_document(self, document_id):
        """Collect the indexed sections of one document and store the result.

        Expects the cursor to be on the document's ``.I`` line. On return the
        cursor is on the first following line that starts with ``.I``, or
        past the last line of the file. Appends
        ``(document_id, sanitize_rawtext_with_stats(content))`` to
        ``self.documents`` and returns ``None``.
        """
        pieces = []
        self.current_line_number += 1  # step past the ".I <id>" line
        while self.current_line_number < len(self.lines):
            marker = self.lines[self.current_line_number][0:2]
            if marker == '.I':
                break  # next document begins here
            if marker in ('.T', '.W', '.K'):
                # process_content stops on the next marker line, so the
                # cursor is already positioned for the next iteration.
                pieces.append(' ' + self.process_content())
            else:
                self.current_line_number += 1
        content = ''.join(pieces)
        self.documents.append(
            (document_id, self.text_processor.sanitize_rawtext_with_stats(content)))

    def process_content(self):
        """Return the raw text of the section starting at the cursor.

        Expects the cursor to be on a section marker line. Reads the lines
        that follow until one starts with any entry of
        ``self.category_markers`` or the file ends, and returns them
        concatenated (line endings preserved). On return the cursor is on
        that marker line, or past the last line of the file.
        """
        self.current_line_number += 1  # step past the marker line itself
        collected = []
        while self.current_line_number < len(self.lines):
            line = self.lines[self.current_line_number]
            if line[0:2] in self.category_markers:
                break
            # Multi-line content is fine: words are processed individually later.
            collected.append(line)
            self.current_line_number += 1
        return ''.join(collected)