-
Notifications
You must be signed in to change notification settings - Fork 1
/
loadFeatures.py
127 lines (98 loc) · 3.77 KB
/
loadFeatures.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# loadFeatures.py
# -----------------
# Main file for parsing data from the MNIST dataset, and passing it onto the
# machine learning algorithms.
#
# Chet Aldrich, Laura Biester
from mnist import *
from util import Counter
from progressBar import ProgressBar
import random
def loadTrainingData(n=None, pixels=0, tune=False):
    """
    Load the MNIST training set, optionally hold out a validation split,
    and convert the images into binary pixel features.

    Args:
        n: maximum number of training examples to keep (None = use all)
        pixels: number of border pixels to chop off each image edge
        tune: when True, hold out the last 100 shuffled examples as a
              validation set

    Returns:
        (trainingData, trainingLabels, validationData, validationLabels,
         trainingFeatures)
    """
    # load data from MNIST files
    images, labels = load_mnist('training')
    # fixed seed so the shuffle (and therefore the split) is reproducible
    random.seed(5)
    # zip() returns an iterator in Python 3; materialize it so
    # random.shuffle (which needs a mutable sequence) can operate on it
    imageLabels = list(zip(images, labels))
    random.shuffle(imageLabels)
    imageLabels = [list(t) for t in zip(*imageLabels)]
    images, labels = imageLabels[0], imageLabels[1]
    # split off 100 training instances if we are tuning
    split = -100 if tune else len(labels)
    # split training and validation images/labels
    trainingImages, trainingLabels = images[:split], labels[:split]
    validationImages, validationLabels = images[split:], labels[split:]
    # cap at n training images; None means "use everything" — guard
    # explicitly because None <= int raises TypeError in Python 3
    if n is None or n > len(trainingLabels):
        n = len(trainingLabels)
    trainingImages, trainingLabels = trainingImages[:n], trainingLabels[:n]
    # get features for data
    trainingData, trainingFeatures = defineFeatures(trainingImages, pixels)
    validationData, validationFeatures = defineFeatures(validationImages, pixels)
    return trainingData, trainingLabels, validationData, validationLabels, trainingFeatures
def loadTestingData(n=None, chop=0, tune=False):
    """
    Load the MNIST testing set and convert the images into binary pixel
    features.

    Args:
        n: maximum number of test examples to return (None/0 = all)
        chop: number of border pixels to chop off each image edge
        tune: unused; kept for signature compatibility with callers

    Returns:
        (testingData, labels)
    """
    # load data from MNIST files
    images, labels = load_mnist('testing')
    # randomize data to be tested on (fixed seed for reproducibility)
    random.seed(5)
    # zip() returns an iterator in Python 3; materialize it so
    # random.shuffle (which needs a mutable sequence) can operate on it
    imageLabels = list(zip(images, labels))
    random.shuffle(imageLabels)
    imageLabels = [list(t) for t in zip(*imageLabels)]
    images, labels = imageLabels[0], imageLabels[1]
    # only return n data points
    if n:
        images = images[:n]
        labels = labels[:n]
    # get features for data
    testingData = defineFeatures(images, chop)[0]
    return testingData, labels
def defineFeatures(imageList, chop):
    """
    Build binary on/off pixel features for every image.

    Each pixel feature is 0 when the raw pixel value is 0 (white) and 1
    otherwise. Chopping removes `chop` pixels from every edge of the image,
    trading classification accuracy for speed.

    Returns:
        (featureList, features): featureList holds one Counter per image
        mapping (x, y) -> 0/1; features lists every (x, y) position used,
        for the perceptron and Naive Bayes classifiers.
    """
    perImageFeatures = []
    allPositions = []
    progress = ProgressBar(100, len(imageList), "Getting Features for Images")
    for idx, img in enumerate(imageList):
        # update progress
        progress.update(idx)
        # map each kept (x, y) position to an on/off indicator
        pixelMap = Counter()
        for row in range(chop, len(img) - chop):
            for col in range(chop, len(img[row]) - chop):
                pixelMap[(row, col)] = 0 if img[row][col] == 0 else 1
        perImageFeatures.append(pixelMap)
    progress.clear()
    # Enumerate every (x, y) position once, based on the first image,
    # for use by the perceptron and Naive Bayes classifiers.
    if len(imageList) > 0:
        first = imageList[0]
        for row in range(chop, len(first) - chop):
            for col in range(chop, len(first[row]) - chop):
                allPositions.append((row, col))
    return perImageFeatures, allPositions
def getFeatureList():
    """
    getFeatureList() returns the list of features necessary for the naive bayes
    classifier with pre-trained probabilities: every (x, y) position of a
    full 28x28 MNIST image, in row-major order.
    """
    return [(x, y) for x in range(28) for y in range(28)]
def main():
    """Entry point: load and featurize the MNIST training data."""
    # The original called loadData(), which is not defined anywhere in this
    # module and raised NameError; loadTrainingData() is the intended call.
    loadTrainingData()
if __name__ == "__main__":
    main()