-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprocess.py
40 lines (38 loc) · 1.1 KB
/
dataprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import os.path
import nltk
# Combine the training ham data into one .txt file.
# Every '.txt' file found directly inside `filepath` is appended, in the order
# os.listdir() returns the names, to 'hamtrain.txt' in the current working
# directory. Any existing 'hamtrain.txt' is overwritten.
filepath = '/Users/kejunchen/Desktop/hamdebug/'
filenames = os.listdir(filepath)
# Keep only names whose extension is exactly '.txt' (case-sensitive match).
datalist = [name for name in filenames if os.path.splitext(name)[1] == '.txt']
# `filepath` already ends with a separator, so join() produces the same paths
# the plain string concatenation did.
datapath = [os.path.join(filepath, name) for name in datalist]
with open('hamtrain.txt', 'w') as fh:
    for text_file in datapath:
        # Read each source file inside its own `with` so the handle is closed
        # right away instead of leaking until garbage collection.
        with open(text_file, 'r') as data:
            fh.write(data.read())
# No explicit fh.close() is needed: the `with` statement closes the output.
# Combine the training spam data into one .txt file.
# Every '.txt' file found directly inside `filepath` is appended, in the order
# os.listdir() returns the names, to 'spamtrain.txt' in the current working
# directory. Any existing 'spamtrain.txt' is overwritten.
filepath = '/Users/kejunchen/Desktop/spamdebug/'
filenames = os.listdir(filepath)
# Keep only names whose extension is exactly '.txt' (case-sensitive match).
datalist = [name for name in filenames if os.path.splitext(name)[1] == '.txt']
# `filepath` already ends with a separator, so join() produces the same paths
# the plain string concatenation did.
datapath = [os.path.join(filepath, name) for name in datalist]
with open('spamtrain.txt', 'w') as fh:
    for text_file in datapath:
        # Read each source file inside its own `with` so the handle is closed
        # right away instead of leaking until garbage collection.
        with open(text_file, 'r') as data:
            fh.write(data.read())
# No explicit fh.close() is needed: the `with` statement closes the output.
# Build the full training file 'train.txt' in the current working directory:
# the contents of 'hamtrain.txt' first, followed by 'spamtrain.txt'.
with open('train.txt', 'w') as fh:
    for part in ('hamtrain.txt', 'spamtrain.txt'):
        # Each input gets its own `with` so its handle is closed as soon as it
        # has been copied, rather than being left open.
        with open(part, 'r') as data:
            fh.write(data.read())