-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathZapposParser.py
121 lines (94 loc) · 2.71 KB
/
ZapposParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# Prepares a zappos dataset to be sent to Weka
import csv
import time
import re
from porterStemmer import StemSentence
# Base name (without extension) of the raw text corpus used to build the
# vocabulary; '.txt' is appended when the file is opened further down.
dataset = 'allzappostext'  # Change this to point to your own text file
def sortedDictValues(adict,i):
    """Dump the contents of ``adict`` to ``wordFreq.csv`` in the working directory.

    One ``key,value`` line is written per entry, in the dict's own iteration
    order -- despite the function's name, nothing is sorted. Any existing
    ``wordFreq.csv`` is overwritten.

    Args:
        adict: Mapping whose keys and values are written via ``str()``.
        i: Unused; kept so existing callers that pass a second argument
            keep working.

    Returns:
        None.
    """
    # A single handle managed by ``with`` guarantees the file is flushed and
    # closed even if a write fails (the old code never reached its close()).
    with open('wordFreq.csv', 'w') as f:
        for wordPair in adict:
            f.write(''.join([str(wordPair), ",", str(adict[wordPair]), "\n"]))
# ---------------------------------------------------------------------------
# Pass 1: read the raw text corpus line by line, stem and clean each line,
# and count how often every (lower-cased) token occurs. The resulting
# ``wordCounts`` dict is dumped to disk through sortedDictValues().
# ---------------------------------------------------------------------------
# NOTE(review): time.clock() only exists on Python < 3.8; switch to
# time.perf_counter() if this script is ever ported to Python 3.
start = time.clock()
print("Starting...")
wordCounts = {}
rowCounter = 0
documents = []  # never populated by this script; name kept for compatibility
myText = ""
# Matches every run of characters that is neither an ASCII letter nor an
# apostrophe. Compiled once here instead of once per token.
p = re.compile('[^a-zA-Z\']+')
# ``with`` closes the corpus file even if stemming raises part-way through.
with open(dataset +'.txt', 'rb') as TextReader:
    for row in TextReader:
        rowCounter = rowCounter + 1
        myText = ''.join(StemSentence(row))
        if rowCounter%100 == 0:
            # Progress report; two spaces after the colon match the old output.
            print("Processing line:  %d" % rowCounter)
        # Replace punctuation inside each space-separated piece by a blank and
        # glue the pieces back together with a leading blank, then split again.
        # This deliberately mirrors the original tokenisation, including the
        # empty-string tokens it yields, so the vocabulary stays the same.
        DocWordsCleaned = " " + " ".join(
            [p.sub(' ', piece) for piece in myText.split(" ")])
        DocWords = DocWordsCleaned.split(" ")
        for word in DocWords:
            # Counting (case-insensitive)
            key = word.lower()
            wordCounts[key] = wordCounts.get(key, 0) + 1
# Persist the frequency table once the whole corpus has been counted.
sortedDictValues(wordCounts, rowCounter)
elapsed = (time.clock() - start)
print(elapsed)
# ---------------------------------------------------------------------------
# Pass 2: featurization. Every row of zapposdesc.csv (first column = text)
# becomes one line of featres.csv holding a count per vocabulary word.
# A word that directly follows a negation word counts as -1 instead of +1.
# ---------------------------------------------------------------------------
# Freeze the vocabulary order: column i of every output row is wordKeys[i].
wordKeys = list(wordCounts)
# word -> column index, so each token lookup is a dict hit rather than a
# linear list.index() scan over the whole vocabulary.
wordColumns = dict((key, column) for column, key in enumerate(wordKeys))
stopWords = ('a', 'the', 'an' , 'and', 'of', 'I')
nagationWords = ( 'not' , 'no' , 'don\'t' ,'cannot', 'declined' ,
                  'denied' , 'denies' , 'unremarkable', 'without' )
# Not read by this pass (input comes from zapposdesc.csv); kept because the
# original script rebinds the name here.
dataset = 'mytextfile' #Change this to point to your own text file
# NOTE(review): time.clock() only exists on Python < 3.8; switch to
# time.perf_counter() if this script is ever ported to Python 3.
start = time.clock()
print("Starting... featurization")
rowCounter = 0
# Runs of characters that are neither ASCII letters nor apostrophes;
# compiled once instead of once per token.
p = re.compile('[^a-zA-Z\']+')
# NOTE(review): 'rb' is the mode the Python 2 csv module expects; Python 3
# would need text mode with newline=''.
with open('zapposdesc.csv', 'rb') as descFile:
    TextReader = csv.reader(descFile, delimiter=',', quotechar='"')
    with open("featres.csv", "w") as f:
        for row in TextReader:
            if not row:
                # csv.reader yields [] for a blank line; there is no text to
                # featurize, and row[0] would raise IndexError.
                continue
            rowCounter = rowCounter + 1
            description = row[0]
            myText = ''.join(StemSentence(description))
            # Same tokenisation as the vocabulary pass: blank out punctuation
            # piece by piece, re-join with a leading blank, split on blanks.
            DocWordsCleaned = " " + " ".join(
                [p.sub(' ', piece) for piece in myText.split(" ")])
            DocWords = DocWordsCleaned.split(" ")
            featureCounts = [0]*len(wordKeys)
            prevWord = ""
            for word in DocWords:
                if word in stopWords:
                    # Stop words are skipped entirely and do not become
                    # prevWord, so a negation still reaches the next real word.
                    continue
                increment = 1
                if prevWord in nagationWords:
                    increment = -1
                wordIndex = wordColumns.get(word.lower())
                if wordIndex is None:
                    # Token was never seen while building the vocabulary:
                    # report it together with the description it came from.
                    print(word.lower())
                    print(description)
                else:
                    featureCounts[wordIndex] = featureCounts[wordIndex]+increment
                prevWord = word
            # One comma-separated line of integer counts per description.
            f.write(','.join([str(count) for count in featureCounts]))
            f.write('\n')
            print(" ")
elapsed = (time.clock() - start)
print(elapsed)