# featuresEducationNumerical.py
import numpy as np
## Load training data and convert into two-dimensional array.
## Find unique features and construct feature array to be used
## in Perceptron.
# Field layout shared by the train/dev/test files, in column order:
# age, work class, education, marital status, occupation, race, gender,
# work hours, country, salary label.
_INCOME_DTYPE = [('f0', '<i4'), ('f1', 'U17'),
                 ('f2', 'U13'), ('f3', 'U22'),
                 ('f4', 'U18'), ('f5', 'U19'),
                 ('f6', 'U7'), ('f7', '<i4'),
                 ('f8', 'U27'), ('f9', 'U6')]

# Columns that are one-hot encoded, in the order their categories appear in
# the feature array: work, marital status, occupation, race, gender, country.
_CATEGORICAL_COLUMNS = (1, 3, 4, 5, 6, 8)
_AGE_COLUMN = 0
_EDUCATION_COLUMN = 2
_WORKHOURS_COLUMN = 7

# Ordinal value assigned to each education level.
_EDUCATION_RANK = {'Preschool': 0, '1st-4th': 1, '5th-6th': 2,
                   '7th-8th': 3, '9th': 4, '10th': 5, '11th': 6,
                   '12th': 7, 'HS-grad': 8, 'Some-college': 9, 'Assoc-voc': 10,
                   'Assoc-acdm': 11, 'Bachelors': 12, 'Masters': 13,
                   'Doctorate': 14, 'Prof-school': 15}


def _readIncomeRecords(path, **genfromtxtOptions):
    """Read one income file as a 1-D structured array of records."""
    # genfromtxt returns a 0-d array for a single-record file; normalise it
    # so the callers can always treat the result as a sequence of records.
    return np.atleast_1d(
        np.genfromtxt(path, dtype=_INCOME_DTYPE, **genfromtxtOptions))


def _toStringTable(records):
    """Convert structured records into a 2-D string array (rows x 10)."""
    # An explicit string dtype avoids relying on NumPy's implicit coercion of
    # mixed int/str tuples; the reshape keeps the table 2-D even when empty.
    table = np.array(records.tolist(), dtype=str)
    return table.reshape(-1, len(_INCOME_DTYPE))


def _encodeRows(table, categoriesPerColumn):
    """Build the integer feature matrix (one-hot + numeric + bias) for a table."""
    # Each categorical column is compared only against its own categories, so
    # a token that occurs in several columns (e.g. '?') cannot switch on a
    # feature belonging to a different column. Values never seen in the
    # training data simply produce an all-zero group.
    oneHotGroups = [
        (table[:, [column]] == categories).astype(int)
        for column, categories in zip(_CATEGORICAL_COLUMNS, categoriesPerColumn)
    ]
    ages = table[:, _AGE_COLUMN].astype(int)
    workHours = table[:, _WORKHOURS_COLUMN].astype(int)
    educationRanks = np.array(
        [_EDUCATION_RANK[level] for level in table[:, _EDUCATION_COLUMN]],
        dtype=int)
    bias = np.ones(len(table), dtype=int)
    return np.column_stack(
        oneHotGroups + [ages, workHours, educationRanks, bias])


def BinarizeData(sort=0, shuffle=0):
    """Load the income train/dev/test files and encode them as feature matrices.

    The files are read from ``income-data/income.train.txt``,
    ``income-data/income.dev.txt`` and ``income-data/income.test.txt``
    relative to the current working directory.

    Every row is encoded as: one-hot indicators for work, marital status,
    occupation, race, gender and country (categories taken from the training
    data, each group in ``np.unique`` order), followed by age, work hours,
    the ordinal education rank and a constant bias of 1.

    Args:
        sort: When equal to 1, the training records are sorted by the salary
            field and then reversed before encoding.
        shuffle: When equal to 1, the training rows are shuffled in place with
            ``np.random.shuffle`` (applied after the optional sort).

    Returns:
        A tuple ``(finalData, finalDevData, finalTestData, featureArray)``:
        ``finalData`` and ``finalDevData`` are integer matrices whose last
        column is 1 for a ``'>50K'`` label and 0 otherwise; ``finalTestData``
        is a string matrix holding the encoded features followed by the raw
        label column; ``featureArray`` names the feature columns (the label
        column is not included).

    Raises:
        KeyError: If a row contains an education level that has no entry in
            the education ranking table.
    """
    rawTrainData = _readIncomeRecords("income-data/income.train.txt",
                                      delimiter=", ")
    rawDevData = _readIncomeRecords("income-data/income.dev.txt",
                                    delimiter=", ")
    rawTestData = _readIncomeRecords("income-data/income.test.txt",
                                     delimiter=",", autostrip=True)

    if sort == 1:
        rawTrainData = np.flip(
            np.sort(rawTrainData, order='f9', axis=0), axis=0)

    data = _toStringTable(rawTrainData)
    devData = _toStringTable(rawDevData)
    testData = _toStringTable(rawTestData)

    if shuffle == 1:
        np.random.shuffle(data)

    # Categories come from the training data only; dev/test reuse them.
    categoriesPerColumn = [np.unique(data[:, column])
                           for column in _CATEGORICAL_COLUMNS]
    featureArray = np.hstack(
        categoriesPerColumn
        + [['Age'], ['WorkHours'], ['Education'], ['Bias']])

    trainLabels = (data[:, -1:] == '>50K').astype(int)
    devLabels = (devData[:, -1:] == '>50K').astype(int)

    finalData = np.concatenate(
        [_encodeRows(data, categoriesPerColumn), trainLabels], axis=1)
    finalDevData = np.concatenate(
        [_encodeRows(devData, categoriesPerColumn), devLabels], axis=1)
    # The test labels are kept as raw strings, so the features are converted
    # to strings explicitly rather than through NumPy's deprecated implicit
    # number-to-string promotion in concatenate.
    finalTestData = np.concatenate(
        [_encodeRows(testData, categoriesPerColumn).astype(str),
         testData[:, -1:]], axis=1)

    return finalData, finalDevData, finalTestData, featureArray