-
Notifications
You must be signed in to change notification settings - Fork 1
/
training.js
217 lines (215 loc) · 7.52 KB
/
training.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
// Global variables
// Unique keywords found in the training data; assigned by training() and loadTraining().
var keywords = [];
// Advert labels, parsed from the "categories" entry of local storage at load time
// (JSON.parse yields null when that entry does not exist).
var labels = JSON.parse(localStorage.getItem('categories'));
// 2D array of counts: first dimension is the label index, second the keyword index.
var count_matrix = [];
// Adverts produced by splitTrainingData(): each one is an array [label, text].
var training_data = [];
/**
 * Performs the training part of the application: initializes the variables
 * that contain the keywords, training data and count matrix, then saves the
 * labels, keywords and count matrix to local storage.
 *
 * When local storage has no "training_data" entry yet, the corpus held in the
 * global training_data_str is stored there; otherwise the stored corpus
 * replaces training_data_str and is used for training.
 */
function training() {
    // Read the stored corpus once, into a local (the original leaked an implicit global here)
    var stored_data = localStorage.getItem("training_data");
    if (stored_data === null) {
        localStorage.setItem("training_data", training_data_str);
    } else {
        training_data_str = stored_data;
    }
    // Split the training data into individual ads with a label and some text
    training_data = splitTrainingData(training_data_str);
    // Create a list of keywords and a dictionary
    keywords = buildDictionary(training_data);
    // Create count matrix
    createCountMatrix(labels, training_data);
    // Save training variables to local storage
    localStorage.setItem("labels", serializeArray(labels));
    localStorage.setItem("keywords", serializeArray(keywords));
    localStorage.setItem("count_matrix", serializeMatrix(count_matrix));
    /**/console.log("# Keywords = %i\n\n", keywords.length);
}
/**
 * Restores the labels, keywords and count matrix from the values previously
 * saved in local storage. Any error raised while reading or converting the
 * stored values is reported through console.error and not rethrown.
 */
function loadTraining() {
    /**/console.log("function loadTraining\n");
    try {
        // Fetch the three serialized values first, then convert them in the same order
        const stored = ["labels", "keywords", "count_matrix"].map(function (key) {
            return localStorage.getItem(key);
        });
        labels = deserializeStrArray(stored[0]);
        keywords = deserializeStrArray(stored[1]);
        count_matrix = deserializeMatrix(stored[2]);
        /**/console.log("# Keywords = %i\n\n", keywords.length);
    } catch (err) {
        console.error(err);
    }
}
/**
 * Tells whether a previous training run left its results in local storage,
 * i.e. whether the "keywords", "labels" and "count_matrix" entries all exist.
 * @return {boolean} True if already trained; false otherwise.
 */
function checkTraining() {
    var required_keys = ["keywords", "labels", "count_matrix"];
    // Stops at the first missing entry, checking the keys in the order listed
    var is_trained = required_keys.every(function (key) {
        return localStorage.getItem(key) != null;
    });
    /**/console.log("is_trained =", is_trained);
    return is_trained;
}
/* -------------------------- Main Functions -------------------------- */
/**
 * Splits the training data into an array of individual adverts.
 * Each advert consists of an array containing a label and some text; the
 * text is passed through tokeniseText() and its tokens re-joined with single
 * spaces.
 *
 * Adverts are delimited by ';' and the label is separated from the text by
 * the first '::' of the advert. Blank segments (such as the one that follows
 * a trailing ';') are ignored, and segments without a '::' are skipped with a
 * console warning.
 * @param {String} training_data String containing the training data.
 * @return {Array} Training data split into individual adverts.
 */
function splitTrainingData(training_data) {
    var adverts = [];
    var entries = training_data.split(';');
    for (var i = 0; i < entries.length; i++) {
        var entry = entries[i];
        // Filter blanks instead of always dropping the last segment, so the
        // final advert survives when the string has no trailing ';'
        if (entry.trim() === '') {
            continue;
        }
        // Cut at the first '::' only, so text that itself contains '::' stays whole
        var separator_index = entry.indexOf('::');
        if (separator_index === -1) {
            console.warn("splitTrainingData: skipping advert without '::' delimiter:", entry);
            continue;
        }
        var label = entry.slice(0, separator_index);
        var text = entry.slice(separator_index + 2);
        adverts.push([label, tokeniseText(text).join(' ')]);
    }
    return adverts;
}
// Number of adverts added since the last retraining
var new_ad_counter = 0;
/**
 * Includes an advert in the training data kept in local storage.
 * The advert is appended as "<label>:: <words joined by spaces>;". After
 * every fifth added advert, training() is run again and the counter restarts.
 * @param {Array} ad_txt Array containing the advert text.
 * @param {String} label Advert label.
 */
function addTrainingData(ad_txt, label) {
    var ad_data = label + ':: ' + ad_txt.join(' ') + ';';
    // Add advert to training data
    var old_training = localStorage.getItem("training_data");
    if (old_training === null) {
        // Nothing stored yet: concatenating null would write the text "null"
        // in front of the first label. Start from the default corpus that
        // training() would have stored, when it is available.
        old_training = (typeof training_data_str === 'string') ? training_data_str : '';
    }
    localStorage.setItem("training_data", old_training + ad_data);
    // Redo training
    new_ad_counter += 1;
    if (new_ad_counter >= 5) {
        training();
        new_ad_counter = 0;
    }
}
/**
 * Builds the keyword dictionary: collects every word of the advert texts
 * with getKeywords() and removes the repetitions with getUniqueWords().
 * @param {Array} training_data Training data.
 * @return {Array} List of keywords found in training data.
 */
function buildDictionary(training_data) {
    return getUniqueWords(getKeywords(training_data));
}
/**
 * Creates a count matrix using the training data.
 * The global count matrix is first reset by initMatrix() to one row per label
 * and one column per entry of the global keywords list; then, for every word
 * of every advert, updateCountMatrix() is called with the word and the
 * advert's label.
 * @param {Array} labels List of advert labels.
 * @param {Array} ad_texts Training data.
 */
function createCountMatrix(labels, ad_texts) {
    // Initialize empty count matrix
    initMatrix(labels.length, keywords.length);
    // Update count matrix using the training data
    for (var ad of ad_texts) {
        var ad_label = getAdLabel(ad);
        // Kept local: the original assigned an undeclared (global) variable here
        var ad_tokens = ad[1].split(' ');
        for (var token of ad_tokens) {
            updateCountMatrix(token, ad_label);
        }
    }
}
/* -------------------------- Other functions -------------------------- */
/**
 * Gets all the keywords found in the advert texts.
 * The text of every advert (element 1 of each entry) is joined with single
 * spaces and the result is split on spaces, so repeated words are kept.
 * @param {Array} training_data Training data.
 * @return {Array} List of all the words found; empty when there are no adverts.
 */
function getKeywords(training_data) {
    // No adverts: nothing to index (the original read element -1 here and threw)
    if (training_data.length === 0) {
        return [];
    }
    var texts = training_data.map(function (ad) {
        return ad[1];
    });
    // join() puts the blank only between texts, never at the end
    return texts.join(' ').split(' ');
}
/**
 * Gets the unique words found in an array.
 * The result is sorted with the default Array sort order. The input array is
 * left untouched.
 * @param {Array} arr List of words.
 * @return {Array} Sorted list of unique words found in arr.
 */
function getUniqueWords(arr) {
    // The Set removes repetitions; sorting the copy avoids reordering the caller's array
    return Array.from(new Set(arr)).sort();
}
/**
 * Initializes the count matrix (2d array) and sets all values to 0.
 * The global count_matrix is resized in place to exactly len_labels rows
 * (first dimension: labels), each holding len_keywords zeros (second
 * dimension: keywords).
 * @param {Number} len_labels Number of labels.
 * @param {Number} len_keywords Number of keywords.
 */
function initMatrix(len_labels, len_keywords) {
    // Drop rows left over from an earlier, larger matrix
    count_matrix.length = len_labels;
    // Zero every row: the original only zeroed the first two, whatever the label count
    for (var i = 0; i < len_labels; i++) {
        count_matrix[i] = new Array(len_keywords).fill(0);
    }
}
/**
 * Increments the count matrix element corresponding
 * to a keyword and a label.
 * The keyword and label are looked up in the global keywords and labels
 * lists; when either is not found the matrix is left unchanged and a warning
 * is logged.
 * @param {String} keyword Keyword.
 * @param {String} label Label.
 */
function updateCountMatrix(keyword, label) {
    var keyword_index = keywords.indexOf(keyword);
    var label_index = labels.indexOf(label);
    // indexOf gives -1 for unknown values; indexing with it would throw (label)
    // or store NaN under the property "-1" (keyword)
    if (keyword_index === -1 || label_index === -1) {
        console.warn("updateCountMatrix: unknown keyword or label:", keyword, label);
        return;
    }
    count_matrix[label_index][keyword_index]++;
}
/**
 * Gets the label that corresponds to an advert: the first element of the
 * advert array.
 * @param {Array} ad Advert from the training data.
 * @return {String} Label corresponding to the advert.
 */
function getAdLabel(ad) {
    // TODO: change labels
    // Earlier experiments derived "sensitive"/"other" from the text length or
    // from a "location" label; the stored label is now returned as is.
    var stored_label = ad[0];
    return stored_label;
}