-
Notifications
You must be signed in to change notification settings - Fork 5
/
parseFile.py
53 lines (49 loc) · 1.74 KB
/
parseFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import string
import re
from readableTextFile import ReadableTextFile
import json
def trimFileEdges(fileName="", relativePath = "", absolutePath = os.path.dirname(os.path.abspath(__file__))+"/",
                  tripWirePhrase = "From:", exitPhrase = "LZFu"):
    '''Return the lines of a message file between two marker phrases.

    The file opened is absolutePath + relativePath + fileName (plain string
    concatenation). By default absolutePath is the directory containing this
    script, so a bare fileName is looked up next to it; relativePath can
    point at a sub-directory of that location, or absolutePath can be
    replaced altogether.

    Every line is first passed through removeExtremelyStrangeCharacters.
    Capturing starts at (and includes) the first cleaned line containing
    tripWirePhrase, and stops just before the first later line containing
    exitPhrase. The starting line itself is not checked for exitPhrase.

    Returns a list of cleaned lines; the list is empty when tripWirePhrase
    never occurs, and runs to the end of the file when exitPhrase never
    occurs.
    '''
    relevantPortion = []
    started = False
    # Deliberately concatenated rather than os.path.join'ed: wrapperFunction
    # passes a complete file path as absolutePath and leaves the other two
    # parts empty, which join() would turn into a directory-style path.
    with open(absolutePath + relativePath + fileName) as source:
        # Stream the file so nothing after the exit phrase is read or cleaned.
        for rawLine in source:
            line = removeExtremelyStrangeCharacters(rawLine)
            if not started:
                if tripWirePhrase in line:
                    relevantPortion.append(line)
                    started = True
            elif exitPhrase in line:
                break
            else:
                relevantPortion.append(line)
    return relevantPortion
def removeExtremelyStrangeCharacters(s):
    '''First level of filtering, expects raw file text.

    Returns a string made of the characters of s that appear in
    string.printable[0:98], i.e. digits, ASCII letters, punctuation, space,
    tab, newline and carriage return. Everything else (including vertical
    tab and form feed, the last two entries of string.printable) is dropped.
    '''
    # Slice once per call instead of once per character.
    allowed = string.printable[0:98]
    # join() always yields a string; filter() only did so on Python 2 and
    # returns a lazy iterator on Python 3, which broke "phrase in line" checks.
    return "".join(character for character in s if character in allowed)
def wrapperFunction(filePaths, outputFile):
    '''Parse each message file and dump the results as JSON lines.

    Each entry of filePaths is handed to trimFileEdges as the complete path
    of a message file. The trimmed lines are fed to a fresh ReadableTextFile
    (parseMetaData, then parseMessageBody) and its data attribute is
    collected. Only after every file has been processed is outputFile
    opened, and one JSON document per line is written to it.
    '''
    collected = []
    for messagePath in filePaths:
        trimmedLines = trimFileEdges(absolutePath=messagePath)
        parser = ReadableTextFile()
        parser.parseMetaData(trimmedLines)
        parser.parseMessageBody(trimmedLines)
        collected.append(parser.data)

    # Opened only once all parsing has succeeded, so a parse failure never
    # creates or truncates the output file.
    with open(outputFile, 'wb') as dataDump:
        for record in collected:
            serialized = json.dumps(record)
            dataDump.write(serialized + "\n")
# Ad-hoc run: trim one sample message, parse it and print its data as JSON.
# NOTE(review): this executes at import time, so importing this module also
# reads the sample file below. Consider moving it under an
# `if __name__ == "__main__":` guard once nothing relies on that behaviour.
relativePath = "../DesktopClient/GraphInterface/src/testData/Field2Emails/"
corpus = trimFileEdges("Follow-up.msg", relativePath)
test = ReadableTextFile()
test.parseMetaData(corpus)
test.parseMessageBody(corpus)
# Parenthesised single-argument form: prints the same text on Python 2 and is
# valid syntax on Python 3, unlike the bare print statement.
print(json.dumps(test.data))