-
Notifications
You must be signed in to change notification settings - Fork 0
/
digitDecision.rb
424 lines (354 loc) · 8.83 KB
/
digitDecision.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
require 'csv'
$recursiveCalls = 0
class Example
def initialize(features)
@features = features
@outcome = nil
end
def features
@features
end
def features=(value)
@features = value
end
def outcome
@outcome
end
def outcome=(value)
@outcome = value
end
def to_s
puts self.features.to_s
puts self.outcome.to_s
end
end
class ExampleSet
def initialize(examples)
@examples = examples
@featureCount = 0
@tally = Hash.new
end
def examples
@examples
end
def examples=(value)
@examples = value
end
def tally
@tally
end
def tally=(value)
@tally = value
end
def featureCount
@featureCount
end
def featureCount=(value)
@featureCount = value
end
def calculate
possibleOutcomes = Array.new
self.examples.each do |e|
possibleOutcomes.push(e.outcome)
end
possibleOutcomes.uniq!
possibleOutcomes.each do |o|
self.tally[o] = 0
end
self.examples.each do |e|
self.tally[e.outcome] += 1
end
self.featureCount = self.examples.first.features.size
end
def freq(outcome)
total = 0
self.tally.each_value {|value| total += value}
return self.tally[outcome] / total.to_f
end
def info
total = 0
info = 0
self.tally.each_value {|value| total += value}
self.tally.each_key {|key| info += ((self.freq(key)/total) * Math::log(self.freq(key)/total, 2))}
if info.nan?
return 0
else
return -info
end
end
def featureInfo(feature)
#features is used to calculate the number of different attributes
features = Array.new
#groups is an array used to store the results of splitting on each attribute
groups = Array.new
self.examples.each do |e|
features.push(e.features[feature])
end
features.uniq!
features.each do |f|
#create a new example set that matches this feature attribute
examples = Array.new
self.examples.each do |example|
if example.features[feature] == f
examples.push(example)
end
end
if examples != nil
currentExampleSet = ExampleSet.new(examples)
groups.push(currentExampleSet)
end
end
info = 0
numExamples = 0
groups.each do |item|
item.calculate
item.tally.each_value {|value| numExamples += value}
end
groups.each do |i|
total = 0
i.tally.each_value {|value| total += value}
info += (total.to_f / numExamples) * i.info
end
return info
end
# Feature is an integer defining which feature to calculate
# information gain for
def gain(feature)
return self.info - self.featureInfo(feature)
end
# Returns a boolean value that determines whether the example set contains values of
# only one type of outcome.
def pure?
outcomes = Array.new
self.examples.each do |e|
outcomes.push(e.outcome)
end
outcomes.uniq!
if outcomes.size == 1
return true
else
return false
end
end
# Determines the best feature to split on based on finding
# the feature with the most information gain
def splitOn?
maxGain = 0
bestFeature = nil
for i in 0..self.featureCount-1
gain = self.gain(i)
if gain >= maxGain
maxGain = gain
bestFeature = i
end
end
if maxGain == 0
bestFeature = rand(0..self.featureCount-1)
end
return bestFeature
end
def getValues(field)
values = Array.new
self.examples.each do |e|
values.push(e.features[field])
end
values.uniq!
return values
end
# returns a portion of the set that has a matching value at the given
# field
def giveMe(field, value)
examples = Array.new
self.examples.each do |e|
if e.features[field] == value
examples.push(e)
end
end
set = ExampleSet.new(examples)
set.calculate
return set
end
def to_s
self.tally.each {|key, value| puts key.to_s + " ** " + value.to_s }
end
end
class DecisionTreeNode
def initialize(field, outcome, exampleHash)
@field = field
@outcome = outcome
@splitOnSoFar = Array.new
@exampleHash = exampleHash
end
def field
@field
end
def outcome
@outcome
end
def exampleHash
@exampleHash
end
def field=(value)
@field = value
end
def outcome=(value)
@outcome = value
end
def exampleHash=(value)
@exampleHash=(value)
end
def to_s
puts "-------------------"
if field != nil
puts "Attr: " + field.to_s
end
if outcome != nil
puts "Outcome: " + outcome.to_s
end
puts "-------------------"
end
end
def buildTree(set)
$recursiveCalls += 1
if set.examples.empty?
node = DecisionTreeNode.new(nil, "Failure", nil)
puts "failure"
return node
elsif set.pure?
node = DecisionTreeNode.new(nil, set.examples.first.outcome, nil)
puts "pure set"
return node
else
currentSplit = set.splitOn?
puts "splitting on " + currentSplit.to_s
hash = Hash.new
branches = set.getValues(currentSplit)
branches.each do |b|
exampleSet = set.giveMe(currentSplit, b)
hash[b] = buildTree(exampleSet)
end
node = DecisionTreeNode.new(currentSplit, nil, hash)
return node
end
end
def decide(node, example)
if node != nil
if node.outcome == "Failure"
puts "Failed to Classify"
return nil
elsif node.field == nil
example.outcome = node.outcome
#puts "Decided: " + node.outcome.to_s
return node.outcome
else
if node.exampleHash.has_key?(example.features[node.field])
decide(node.exampleHash[example.features[node.field]], example)
else
puts "We'll need to make an estimate, no prior information about this one."
return rand(0..9)
end
end
else puts "nil node"
end
end
# Will assume that the last value of each line is the actual intended outcome
# in order to determine the success of the tree
def analyzeTestSet(testFile, tree, outcomeField)
correctCount = 0
incorrectCount = 0
CSV.foreach(testFile) do |row|
outcome = nil
featureValues = Array.new
for i in 0..row.size-1
if (i == outcomeField)
outcome = row[i]
else
featureValues.push(row[i].to_i)
end
end
example = Example.new(featureValues)
example.outcome = outcome
decision = decide(tree, example)
if outcome.eql?(decision)
puts "CORRECT: " + outcome.to_s + " " + decision.to_s
correctCount += 1
else
puts "INCORRECT"
incorrectCount += 1
end
end
percentage = (correctCount.to_f / (correctCount + incorrectCount))
puts "Accuracy: " + percentage.to_s + " on test data"
end
puts "Attempting to read " + ARGV.first
########################## Initialize Program Variables Here #################################
# The user will need to specify the field of the outcome as the second argument to the program
# Outcome field indexing starts at 0
outcomeField = ARGV[1].to_i
lineNum = 0
positiveOutcome = nil
thisOutcome = nil
initialSet = nil
########################## Begin program code Here ###########################################
# Count the number of lines in the file, split each line, and separate into features and outcomes
exampleArray = Array.new
if File.extname(ARGV.first) == ".csv"
CSV.foreach(ARGV.first) do |row|
outcome = nil
featureValues = Array.new
for i in 0..row.size-1
if (i == outcomeField)
outcome = row[i]
else
featureValues.push(row[i].to_i)
end
end
puts featureValues.to_s
example = Example.new(featureValues)
example.outcome = outcome
exampleArray.push(example)
end
initialSet = ExampleSet.new(exampleArray)
else
text = File.open(ARGV.first).read
text.gsub!(/\r\n?/, "\n")
text.each_line do |line|
featureArray = line.split(",")
featureArray.last.delete!("\n")
# remove whitespace from features
featureArray.each do |feature|
feature.lstrip!
end
featureValues = Array.new
for i in 0..featureArray.size-1
if (i == outcomeField) && (lineNum == 0)
positiveOutcome = featureArray[i]
thisOutcome = featureArray[i]
elsif (i == outcomeField)
thisOutcome = featureArray[i]
else
featureValues.push(featureArray[i].to_i)
end
end
puts featureValues.to_s
example = Example.new(featureValues)
example.classify(positiveOutcome, thisOutcome)
exampleArray.push(example)
lineNum += 1
end
initialSet = ExampleSet.new(exampleArray)
end
initialSet.calculate
initialSet.to_s
puts "Initial set info = " + initialSet.info.to_s
bestFeature = initialSet.splitOn?
puts "Best feature = " + bestFeature.to_s + ", Gain: " + initialSet.gain(bestFeature).to_s
puts "*********** Building Tree *********"
tree = buildTree(initialSet)
puts "******** Making Decisions *********"
if ARGV.size > 3
analyzeTestSet(ARGV[2], tree, ARGV[3].to_i)
elsif ARGV.size == 3
puts "You'll need to specify test set location AND outcome field"
end
puts $recursiveCalls.to_s + " nodes in the tree"