-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.html
787 lines (704 loc) · 54.6 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<title>Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences</title>
<meta name="description" content="Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences">
<meta name="generator" content="bookdown 0.3.18 and GitBook 2.6.7">
<meta property="og:title" content="Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences" />
<meta property="og:type" content="book" />
<meta name="twitter:card" content="summary" />
<meta name="twitter:title" content="Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences" />
<meta name="date" content="2017-04-13">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="apple-mobile-web-app-status-bar-style" content="black">
<script src="libs/jquery-2.2.3/jquery.min.js"></script>
<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
<style type="text/css">
div.sourceCode { overflow-x: auto; }
table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
margin: 0; padding: 0; vertical-align: baseline; border: none; }
table.sourceCode { width: 100%; line-height: 100%; }
td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
td.sourceCode { padding-left: 5px; }
code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
code > span.dt { color: #902000; } /* DataType */
code > span.dv { color: #40a070; } /* DecVal */
code > span.bn { color: #40a070; } /* BaseN */
code > span.fl { color: #40a070; } /* Float */
code > span.ch { color: #4070a0; } /* Char */
code > span.st { color: #4070a0; } /* String */
code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
code > span.ot { color: #007020; } /* Other */
code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
code > span.fu { color: #06287e; } /* Function */
code > span.er { color: #ff0000; font-weight: bold; } /* Error */
code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
code > span.cn { color: #880000; } /* Constant */
code > span.sc { color: #4070a0; } /* SpecialChar */
code > span.vs { color: #4070a0; } /* VerbatimString */
code > span.ss { color: #bb6688; } /* SpecialString */
code > span.im { } /* Import */
code > span.va { color: #19177c; } /* Variable */
code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
code > span.op { color: #666666; } /* Operator */
code > span.bu { } /* BuiltIn */
code > span.ex { } /* Extension */
code > span.pp { color: #bc7a00; } /* Preprocessor */
code > span.at { color: #7d9029; } /* Attribute */
code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
</style>
</head>
<body>
<div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
<div class="book-summary">
<nav role="navigation">
<ul class="summary">
<li class="chapter" data-level="1" data-path=""><a href="#objectives"><i class="fa fa-check"></i><b>1</b> Objectives</a></li>
<li class="chapter" data-level="2" data-path=""><a href="#data"><i class="fa fa-check"></i><b>2</b> Data</a></li>
<li class="chapter" data-level="3" data-path=""><a href="#problem"><i class="fa fa-check"></i><b>3</b> Problem</a><ul>
<li class="chapter" data-level="3.1" data-path=""><a href="#sentiment-attribute"><i class="fa fa-check"></i><b>3.1</b> Sentiment Attribute</a></li>
<li class="chapter" data-level="3.2" data-path=""><a href="#sentence-attribute"><i class="fa fa-check"></i><b>3.2</b> Sentence Attribute</a></li>
<li class="chapter" data-level="3.3" data-path=""><a href="#measures-of-prediction-quality"><i class="fa fa-check"></i><b>3.3</b> Measures of Prediction Quality</a></li>
</ul></li>
<li class="chapter" data-level="4" data-path=""><a href="#methods"><i class="fa fa-check"></i><b>4</b> Methods</a><ul>
<li class="chapter" data-level="4.1" data-path=""><a href="#preprocessing"><i class="fa fa-check"></i><b>4.1</b> Preprocessing</a><ul>
<li class="chapter" data-level="4.1.1" data-path=""><a href="#parsing"><i class="fa fa-check"></i><b>4.1.1</b> Parsing</a></li>
<li class="chapter" data-level="4.1.2" data-path=""><a href="#features"><i class="fa fa-check"></i><b>4.1.2</b> Features</a></li>
</ul></li>
<li class="chapter" data-level="4.2" data-path=""><a href="#algorithms"><i class="fa fa-check"></i><b>4.2</b> Algorithms</a><ul>
<li class="chapter" data-level="4.2.1" data-path=""><a href="#feature-selection"><i class="fa fa-check"></i><b>4.2.1</b> Feature Selection</a></li>
<li class="chapter" data-level="4.2.2" data-path=""><a href="#cross-validation"><i class="fa fa-check"></i><b>4.2.2</b> Cross Validation</a></li>
<li class="chapter" data-level="4.2.3" data-path=""><a href="#selected-learners"><i class="fa fa-check"></i><b>4.2.3</b> Selected Learners</a></li>
</ul></li>
</ul></li>
<li class="chapter" data-level="5" data-path=""><a href="#results"><i class="fa fa-check"></i><b>5</b> Results</a></li>
<li class="chapter" data-level="6" data-path=""><a href="#discussion-and-conclusion"><i class="fa fa-check"></i><b>6</b> Discussion and Conclusion</a></li>
<li class="chapter" data-level="7" data-path=""><a href="#references"><i class="fa fa-check"></i><b>7</b> References</a></li>
<li class="chapter" data-level="8" data-path=""><a href="#appendix-a-r-code"><i class="fa fa-check"></i><b>8</b> Appendix A: R Code</a></li>
</ul>
</nav>
</div>
<div class="book-body">
<div class="body-inner">
<div class="book-header" role="navigation">
<h1>
<i class="fa fa-circle-o-notch fa-spin"></i><a href="./">Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences</a>
</h1>
</div>
<div class="page-wrapper" tabindex="-1" role="main">
<div class="page-inner">
<section class="normal" id="section-">
<div id="header">
<h1 class="title">Experiments on Natural Language Processing and Prediction of Sentiment Labelled Sentences</h1>
<h3 class="subtitle"><em>Assignment Submission for Course CP8305 Instructed by Dr. Cherie Ding</em></h3>
<h4 class="author"><em><div style="white-space: pre-line;">Richard Wen
Ryerson University</div></em></h4>
<h4 class="date"><em>April 13, 2017</em></h4>
</div>
<div id="objectives" class="section level1">
<h1><span class="header-section-number">1</span> Objectives</h1>
<p>This assignment was provided by Dr. Cherie Ding for the CP8305 Knowledge Discovery course at Ryerson University. The purpose of the assignment was to:</p>
<ol style="list-style-type: decimal">
<li>Identify a practical dataset from the University of California, Irvine (UCI) Machine Learning Repository <span class="citation">(Lichman 2013)</span> or Knowledge Discovery in Databases (KDD) Cup Archives <span class="citation">(SIGKDD 2013)</span><br />
</li>
<li>Identify a machine learning problem with the chosen dataset in (1)<br />
</li>
<li>Apply various machine learning algorithms to the problem in (2) to find algorithms that can solve (2) well and provide insight into the data in (1)</li>
</ol>
</div>
<div id="data" class="section level1">
<h1><span class="header-section-number">2</span> Data</h1>
<p>The data chosen for this assignment was the Sentiment Labelled Sentences (SLS) Dataset donated on May 30, 2015 and downloaded from the UCI Machine Learning Repository <span class="citation">(Kotzias et al. 2015)</span>. There are 3 text files (amazon_cells_labelled.txt, imdb_labelled.txt, yelp_labelled.txt) with a combined total of 3000 instances, absent of missing values. Each file consists of 2 attributes with the first attribute being sentences (string type) and the second being a binary class of either 0 for negative sentiment or 1 for positive sentiment (numeric type). The data in each file had attributes separated by a mixture of inconsistent spaces and tabs, and instances separated by rows. An example of the first 5 rows is shown in Table <a href="#tab:tab1">2.1</a><a href="#fn1" class="footnoteRef" id="fnref1"><sup>1</sup></a>. Sentences were extracted by <span class="citation">Kotzias et al. (2015)</span> from <a href="https://www.imdb.com">imdb.com</a>, <a href="https://www.amazon.com">amazon.com</a>, and <a href="https://www.yelp.com">yelp.com</a>. These websites represent a movie database, an online retailer, and an online business directory with crowd-sourced reviews, respectively. The SLS files are summarized in Table <a href="#tab:tab2">2.2</a><a href="#fn2" class="footnoteRef" id="fnref2"><sup>2</sup></a>.</p>
<table>
<caption><span id="tab:tab1">Table 2.1: </span>SLS Dataset Example for amazon Data</caption>
<thead>
<tr class="header">
<th align="right">Line</th>
<th align="left">Sample</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="right">1</td>
<td align="left">So there is no way for me to plug it in here in the US unless I go by a converter. (tab) 0</td>
</tr>
<tr class="even">
<td align="right">2</td>
<td align="left">Good case, Excellent value. (tab) 1</td>
</tr>
<tr class="odd">
<td align="right">3</td>
<td align="left">Great for the jawbone. (tab) 1</td>
</tr>
<tr class="even">
<td align="right">4</td>
<td align="left">Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!! (tab) 0</td>
</tr>
<tr class="odd">
<td align="right">5</td>
<td align="left">The mic is great. (tab) 1</td>
</tr>
</tbody>
</table>
<table>
<caption><span id="tab:tab2">Table 2.2: </span>SLS Dataset Summary</caption>
<thead>
<tr class="header">
<th align="left">File</th>
<th align="right">Instances</th>
<th align="right">Attributes</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">amazon_cells_labelled.txt</td>
<td align="right">1000</td>
<td align="right">2</td>
</tr>
<tr class="even">
<td align="left">imdb_labelled.txt</td>
<td align="right">1000</td>
<td align="right">2</td>
</tr>
<tr class="odd">
<td align="left">yelp_labelled.txt</td>
<td align="right">1000</td>
<td align="right">2</td>
</tr>
</tbody>
</table>
</div>
<div id="problem" class="section level1">
<h1><span class="header-section-number">3</span> Problem</h1>
<p>The goal of the SLS dataset was to predict the values in attribute 2 (sentiment) which contain 0 or 1 for negative and positive sentiment respectively. The binary values of attribute 2 defined the SLS dataset as a classification problem. In order to predict sentiment, sentences from three different websites (<a href="https://www.imdb.com">imdb.com</a>, <a href="https://www.amazon.com">amazon.com</a>, and <a href="https://www.yelp.com">yelp.com</a>) were given as the explanatory attribute 1. The text values of attribute 1 further defined the problem as a Natural Language Processing (NLP)<a href="#fn3" class="footnoteRef" id="fnref3"><sup>3</sup></a> problem where the attribute values were unstructured, and required pre-processing before the machine could read and learn to model the SLS data. For simplicity, attribute 1 was referred to as the sentence attribute and attribute 2 was referred to as the sentiment attribute. The problem was then known as a supervised classification<a href="#fn4" class="footnoteRef" id="fnref4"><sup>4</sup></a> problem for the sentiment attribute that required NLP of the sentence attribute. The following sections more formally define the problem framework.</p>
<div id="sentiment-attribute" class="section level2">
<h2><span class="header-section-number">3.1</span> Sentiment Attribute</h2>
<p>The sentiment attribute was the second attribute in the SLS dataset. It is the target vector <span class="math inline">\(y^{(n)}\)</span> containing binary values of either 0 or 1 given <span class="math inline">\(n\)</span> instances as seen in Equation <a href="#eq:eq1">(3.1)</a>:</p>
<span class="math display" id="eq:eq1">\[\begin{equation}
y^{(n)} \in \{0, 1 \}
\tag{3.1}
\end{equation}\]</span>
</div>
<div id="sentence-attribute" class="section level2">
<h2><span class="header-section-number">3.2</span> Sentence Attribute</h2>
<p>The sentence attribute was the first attribute in the SLS dataset. It is the raw text data <span class="math inline">\(x^{(n)}\)</span> given <span class="math inline">\(n\)</span> instances such that it contains <span class="math inline">\(k\)</span> number of words <span class="math inline">\(w^{(k, n)}\)</span> (separated by spaces<a href="#fn5" class="footnoteRef" id="fnref5"><sup>5</sup></a>), where the word lengths<a href="#fn6" class="footnoteRef" id="fnref6"><sup>6</sup></a> <span class="math inline">\(l_w\)</span> of <span class="math inline">\(w^{(k, n)}\)</span> and <span class="math inline">\(k\)</span> are less than the length <span class="math inline">\(l_x\)</span> of the raw texts <span class="math inline">\(x^{(n)}\)</span> as seen in Equation <a href="#eq:eq2">(3.2)</a>:</p>
<span class="math display" id="eq:eq2">\[\begin{equation}
w^{(k, n)} \in x^{(n)} \mid \enspace 0 < l_w^{(k)} \leq l_x^{(n)} \enspace and \enspace 0 < k \leq l_x^{(n)}
\tag{3.2}
\end{equation}\]</span>
</div>
<div id="measures-of-prediction-quality" class="section level2">
<h2><span class="header-section-number">3.3</span> Measures of Prediction Quality</h2>
<p>The classification problem given <span class="math inline">\(x^{(n)}\)</span> as the explanatory data and <span class="math inline">\(y^{(n)}\)</span> as the target classes was to obtain measurement values that define an algorithm to predict <span class="math inline">\(y^{(n)}\)</span> well. Predicted classes <span class="math inline">\(y_{pred}^{(n)}\)</span> are obtained using <span class="math inline">\(x^{(n)}\)</span> training features seen in Equation <a href="#eq:eq3">(3.3)</a>:</p>
<span class="math display" id="eq:eq3">\[\begin{equation}
y_{pred}^{(n)} = f(x^{(n)})
\tag{3.3}
\end{equation}\]</span>
<p>The classification prediction quality used measurements that were based on the <span class="math inline">\(f_{eq}\)</span> counts of <span class="math inline">\(y_{pred}^{(n)}\)</span> that were equal to <span class="math inline">\(y^{(n)}\)</span> given the total number of instances <span class="math inline">\(N\)</span> expressed in Equation <a href="#eq:eq4">(3.4)</a>:</p>
<span class="math display" id="eq:eq4">\[\begin{equation}
f_{eq} = \sum_{n=1}^N y_{eq} \mid
y_{eq} =
\begin{cases}
1: & \text{if}\ y_{pred}^{(n)} = y^{(n)} \\
0: & \text{otherwise}
\end{cases}
\tag{3.4}
\end{equation}\]</span>
<p>Accuracy measurements were defined as a maximization problem, where higher values are better and lower values are worse. Error measurements were defined as a minimization problem, where lower values are better and higher values are worse.</p>
<p>An accuracy measurement <span class="math inline">\(f_{acc}\)</span>, where <span class="math inline">\(C_{acc}\)</span> is a constant representing the best prediction quality, increases the more times <span class="math inline">\(y_{pred}^{(n)}\)</span> is equal to <span class="math inline">\(y^{(n)}\)</span> given <span class="math inline">\(f_{eq}\)</span> as seen in Equation <a href="#eq:eq5">(3.5)</a>:</p>
<span class="math display" id="eq:eq5">\[\begin{equation}
\lim_{f_{acc} \to C_{acc}} f_{acc} \enspace as \enspace f_{eq} \rightarrow \infty \enspace \mid f_{acc} = f(y_{pred}^{(n)}, y^{(n)})
\tag{3.5}
\end{equation}\]</span>
<p>An error measurement <span class="math inline">\(f_{err}\)</span>, where <span class="math inline">\(C_{err}\)</span> is a constant representing the worst prediction quality, decreases the more times <span class="math inline">\(y_{pred}^{(n)}\)</span> is equal to <span class="math inline">\(y^{(n)}\)</span> given <span class="math inline">\(f_{eq}\)</span> as seen in Equation <a href="#eq:eq6">(3.6)</a>:</p>
<span class="math display" id="eq:eq6">\[\begin{equation}
\lim_{f_{err} \to C_{err}} f_{err} \enspace as \enspace f_{eq} \rightarrow \infty \enspace \mid f_{err} = f(y_{pred}^{(n)}, y^{(n)})
\tag{3.6}
\end{equation}\]</span>
</div>
</div>
<div id="methods" class="section level1">
<h1><span class="header-section-number">4</span> Methods</h1>
<p>The methods described in this section attempted to experiment with several solutions to the classification problem defined in Section <a href="#problem">3</a>. The unstructured nature of the sentence attribute required preprocessing to create features which are then further processed to search for adequately useful features (selection). These features were then randomly split into approximately equal number of instances for cross validation training sets. These training sets were then used as input for a selected number of algorithms and evaluated for prediction quality. A summary of the methods is shown in Figure <a href="#fig:fig1">4.1</a>. See Appendix <a href="#appendix-a-r-code">A</a> for the R code.</p>
<div class="figure"><span id="fig:fig1"></span>
<img src="img/fig1.png" alt="Flowchart of Methods" width="299" />
<p class="caption">
Figure 4.1: Flowchart of Methods
</p>
</div>
<div id="preprocessing" class="section level2">
<h2><span class="header-section-number">4.1</span> Preprocessing</h2>
<p>The preprocessing steps involved getting the data ready for the machine learning algorithms to train on. This involved creating features from the instances in the sentence attribute and selecting only the features that are measured to be useful in terms of predicting the target sentiment attribute. Furthermore, the data was also split into cross validation training sets and used as the resulting training data for the machine learning algorithms.</p>
<div id="parsing" class="section level3">
<h3><span class="header-section-number">4.1.1</span> Parsing</h3>
<p>The data was defined as a tab delimited text file that held string based text as the first sentence attribute and numeric binary numbers as the second sentiment attribute. The unstructured nature of the sentence attribute made parsing the file less straight-forward as instances in the sentence attribute were not quoted and could contain any number of tab characters. The data was:</p>
<ol style="list-style-type: decimal">
<li>Parsed line by line</li>
<li>Cleaned by removing the last occurrence of tab characters and punctuation</li>
<li>Extracted for sentiment and sentence instances where the last character was defined as the sentiment, and the rest of the text was defined as the sentence</li>
</ol>
</div>
<div id="features" class="section level3">
<h3><span class="header-section-number">4.1.2</span> Features</h3>
<p>The features<a href="#fn7" class="footnoteRef" id="fnref7"><sup>7</sup></a> were created using a simple bag of words model. The bag of words model constructs a feature for each unique word in the sentence attribute and counts the occurrence per sentence instance in the SLS dataset <span class="citation">(Nadkarni, Ohno-Machado, and Chapman 2011)</span>. Each <span class="math inline">\(k\)</span> word feature <span class="math inline">\(w^{(k)}\)</span> is a count of the occurrence of <span class="math inline">\(k\)</span> unique words in the sentence attribute <span class="math inline">\(x^{(n)}\)</span> given <span class="math inline">\(n\)</span> instances. For clarification, all words were considered, including stop words<a href="#fn8" class="footnoteRef" id="fnref8"><sup>8</sup></a>, as these would be removed in the feature selection process if they were measured as being not very useful - thus, the possibility of them being useful was considered. An example of word features is given in Table <a href="#tab:tab3">4.1</a>. Feature construction was applied using the <em>text2vec</em> R package <span class="citation">(Selivanov 2016)</span>.</p>
<table>
<caption><span id="tab:tab3">Table 4.1: </span>Example of Word Features</caption>
<thead>
<tr class="header">
<th align="right">Word-1</th>
<th align="right">Word-2</th>
<th align="right">Word-k</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="right">2</td>
<td align="right">5</td>
<td align="right">10</td>
</tr>
<tr class="even">
<td align="right">5</td>
<td align="right">2</td>
<td align="right">5</td>
</tr>
<tr class="odd">
<td align="right">10</td>
<td align="right">10</td>
<td align="right">2</td>
</tr>
</tbody>
</table>
</div>
</div>
<div id="algorithms" class="section level2">
<h2><span class="header-section-number">4.2</span> Algorithms</h2>
<p>The algorithms applied to solve the classification problem involved optimizing features and a measure of prediction quality based on training and testing on cross validation folds.</p>
<div id="feature-selection" class="section level3">
<h3><span class="header-section-number">4.2.1</span> Feature Selection</h3>
<p>Feature selection was used to filter the word features <span class="math inline">\(w^{(k)}\)</span> for the most important word features. A measure of importance was defined by using the Random Forest (RF) algorithm. The RF algorithm ensembles multiple decision trees by randomly subsetting portions of the complete data to construct each tree <span class="citation">(Breiman 2001)</span>. For each portion of data subsetted, there were remaining portions that were used to calculate an Out Of Bag (OOB) error by using the constructed decision trees to predict on the remaining data portions. These remaining portions were referred to as the OOB samples, and were used in calculating the feature importances. Each <span class="math inline">\(k\)</span> word feature <span class="math inline">\(w^{(k)}\)</span> was then calculated for feature importances by randomly permuting prediction errors on the OOB samples, and comparing their differences on the permuted and un-permuted errors. These feature importances quantify the usefulness of word features for the classification problem, where higher values represent more useful features and lower values represent less useful features. Initially, features in which the occurrence of the word is less than 0.1% of the total number of occurrences were removed, followed by strict removal of features that were slightly to highly correlated, where features with correlation values of greater than 0.05 were removed. This enabled independence assumptions to be better satisfied as there were an abundance of features. The feature selection then filtered for the top 10 features with the highest feature importance based on a random forest classifier with 500 trees. The feature selection algorithm was applied using the <em>caret</em> package <span class="citation">(Kuhn 2016)</span>.</p>
</div>
<div id="cross-validation" class="section level3">
<h3><span class="header-section-number">4.2.2</span> Cross Validation</h3>
<p>The data from the feature selection was split into a standard 10-fold cross validation scenario. In the 10-fold cross validation scenario, the data was randomly split into 10 equal parts, trained on 9 parts, and tested on the 1 part not in the 9 training parts, until every part had been used once for testing <span class="citation">(Borra and Di Ciaccio 2010)</span>. Cross validation enabled prediction quality measures to evaluate the generalization of algorithms as different parts of the data were used to learn from and predict on. This generalization quality more accurately represented real-world scenarios as there are often missing portions of data, and a complete dataset is often not available as data is continuously updated. Cross validation was also applied using the <em>caret</em> R package.</p>
</div>
<div id="selected-learners" class="section level3">
<h3><span class="header-section-number">4.2.3</span> Selected Learners</h3>
<p>The learners selected for the experiments were standard machine learning choices that were relatively efficient and simple to run as the strict correlation removal procedure and feature selection process enabled relatively independent features of low dimensionality. The selected machine learning algorithms are the Naive Bayes Classifier (NBC), Support Vector Machine (SVM) and Random Forest (RF) from the R package <em>e1071</em> <span class="citation">(Meyer 2017)</span>. NBC is a conditional probability model that applies Bayes&#8217; theorem with the assumption that the features being trained on are independent of each other <span class="citation">(McCallum and Kamal 1998)</span>. RF is a model that constructs many decision trees from random subsamples of the instances and features, and ensembles them for a majority vote of the predicted class <span class="citation">(Breiman 2001)</span>. SVM is a non-probabilistic binary model that constructs a set of hyperplanes in high dimensional space to separate target classes <span class="citation">(Tong and Koller 2001)</span>. The selected algorithms were evaluated based on their F1 score to account for potential binary class imbalances in the sentiment attribute. The F1 score (Equation <a href="#eq:eq9">(4.3)</a>) is the harmonic mean of the precision (Equation <a href="#eq:eq8">(4.2)</a>) and recall (Equation <a href="#eq:eq7">(4.1)</a>) given <span class="math inline">\(N\)</span> instances, <span class="math inline">\(Y_{s}\)</span> correctly predicted instances for a sentiment <span class="math inline">\(s\)</span>, and the number of instances for a sentiment <span class="math inline">\(N_{s}\)</span>. See the learners section under Appendix <a href="#appendix-a-r-code">A</a> for more details.</p>
<span class="math display" id="eq:eq7">\[\begin{equation}
Precision = Y_{s} / N
\tag{4.1}
\end{equation}\]</span>
<span class="math display" id="eq:eq8">\[\begin{equation}
Recall = Y_{s} / N_{s}
\tag{4.2}
\end{equation}\]</span>
<span class="math display" id="eq:eq9">\[\begin{equation}
F1 = 2 \times \dfrac{Precision \times Recall}{Precision + Recall}
\tag{4.3}
\end{equation}\]</span>
</div>
</div>
</div>
<div id="results" class="section level1">
<h1><span class="header-section-number">5</span> Results</h1>
<p>Figure <a href="#fig:fig2">5.1</a> shows the distribution of classes in the sentiment attribute. The results ranged from CV F1 scores of 0.26 to 0.72 per fold. Figure <a href="#fig:fig3">5.2</a> shows the average CV F1 scores obtained from the experiments. The word feature importances are also shown in Figure <a href="#fig:fig4">5.3</a>. The top ten word features based on feature importances were selected from these options, which produced a training feature matrix of 10 columns (reduced from 5189) by 3000 rows, with each <span class="math inline">\(k\)</span> column representing <span class="math inline">\(w^{k}\)</span> word vectors.</p>
<div class="figure"><span id="fig:fig2"></span>
<img src="index_files/figure-html/fig2-1.png" alt="Sentiment Class Distribution" width="672" />
<p class="caption">
Figure 5.1: Sentiment Class Distribution
</p>
</div>
<div class="figure"><span id="fig:fig3"></span>
<img src="index_files/figure-html/fig3-1.png" alt="Average F1 Scores" width="672" />
<p class="caption">
Figure 5.2: Average F1 Scores
</p>
</div>
<div class="figure"><span id="fig:fig4"></span>
<img src="index_files/figure-html/fig4-1.png" alt="Feature Importances" width="672" />
<p class="caption">
Figure 5.3: Feature Importances
</p>
</div>
</div>
<div id="discussion-and-conclusion" class="section level1">
<h1><span class="header-section-number">6</span> Discussion and Conclusion</h1>
<p>Both the RF and SVM algorithms performed similarly, while the NBC algorithm performed the best of the three selected algorithms. However, the random forest was useful in calculating feature importances used in the feature selection process to reduce the number of dimensions on the data, as well as quantify how much more important one word feature was compared to another word feature. In particular, the word features <em>love, nice, works, no,</em> and <em>worst</em> were very useful in classifying the sentiment attribute consisting of 0 or 1s indicating negative and positive respectively. Although the RF and SVM performed rather poorly, the results were reasonable given the large reduction in features and minimal hyperparameter tuning. This was done in exchange for interpretability of results to see what influences positive and negative sentiment without manually reviewing thousands of online reactions, or having advanced knowledge of specific machine learning algorithms. The high dimensionality of the data after processing for the bag of words model made it difficult to obtain quick and efficient methods of feature reduction, however the abundance of features made it more flexible when arbitrary rules were imposed - such as the strict correlation values imposed in the feature selection process. This led to better features for the NBC algorithm, which resulted in the highest average CV F1 score without heavy hyperparameter tuning compared to the RF and SVM. The NBC was a simple, yet efficient and effective algorithm, when features were selected to meet its assumptions of feature independence. Algorithms that can efficiently either utilize high dimensional data or reduce dimensionality with minimal information loss, but still enable the results to be interpreted would be very desirable in machine learning problems involving text based attributes that can be converted into a bag of words model.</p>
</div>
<div id="references" class="section level1">
<h1><span class="header-section-number">7</span> References</h1>
<div id="refs">
<div id="ref-borra2010">
<p>Borra, Simone, and Agostino Di Ciaccio. 2010. “Measuring the Prediction Error. A Comparison of Cross-Validation, Bootstrap and Covariance Penalty Methods.” <em>Computational Statistics and Data Analysis</em> 54 (12). Elsevier: 2976–2989. doi:<a href="https://doi.org/10.1016/j.csda.2010.03.004">10.1016/j.csda.2010.03.004</a>.</p>
</div>
<div id="ref-breiman2001">
<p>Breiman, Leo. 2001. “Random Forests.” <em>Machine Learning</em> 45 (1). Springer: 5–32. doi:<a href="https://doi.org/10.1023/A:1010933404324">10.1023/A:1010933404324</a>.</p>
</div>
<div id="ref-kotzias2015">
<p>Kotzias, Dimitrios, Misha Denil, Nando De Freitas, and Padhraic Smyth. 2015. “From Group to Individual Labels Using Deep Features.” <em>Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</em>. Association for Computing Machinery, 597–606. doi:<a href="https://doi.org/10.1145/2783258.2783380">10.1145/2783258.2783380</a>.</p>
</div>
<div id="ref-kuhn2016">
<p>Kuhn, Max. 2016. “Caret: Classification and Regression Training.” <em>Journal of Statistical Software</em> 28 (5): 1–5. <a href="https://CRAN.R-project.org/package=caret" class="uri">https://CRAN.R-project.org/package=caret</a>.</p>
</div>
<div id="ref-lichman2013">
<p>Lichman, Moshe. 2013. “(UCI) Machine Learning Repository.” <a href="http://archive.ics.uci.edu/ml" class="uri">http://archive.ics.uci.edu/ml</a>.</p>
</div>
<div id="ref-mccallum1998">
<p>McCallum, Andrew, and Kamal Nigam. 1998. “A Comparison of Event Models for Naive Bayes Text Classification.” <em>AAAI-98 Workshop on Learning for Text Categorization</em>.</p>
</div>
<div id="ref-meyer2017">
<p>Meyer, David. 2017. “E1071: Misc Functions of the Department of Statistics, Probability Theory Group (Formerly: E1071), TU Wien.” <a href="https://cran.r-project.org/package=e1071" class="uri">https://cran.r-project.org/package=e1071</a>.</p>
</div>
<div id="ref-prakash2011">
<p>Nadkarni, Prakash M, Lucila Ohno-Machado, and Wendy W Chapman. 2011. “Natural Language Processing: An Introduction.” <em>Journal of the American Medical Informatics Association</em> 18 (5). PubMed Central: 544–51. doi:<a href="https://doi.org/10.1136/amiajnl-2011-000464">10.1136/amiajnl-2011-000464</a>.</p>
</div>
<div id="ref-selivanov2016">
<p>Selivanov, Dmitriy. 2016. “Text2vec: Modern Text Mining Framework for R.” <a href="https://cran.r-project.org/package=text2vec" class="uri">https://cran.r-project.org/package=text2vec</a>.</p>
</div>
<div id="ref-sigkdd2016">
<p>SIGKDD. 2013. “KDD Cup Archives.” <a href="https://www.kdd.org/kdd-cup" class="uri">www.kdd.org/kdd-cup</a>.</p>
</div>
<div id="ref-tong2001">
<p>Tong, Simon, and Daphne Koller. 2001. “Support Vector Machine Active Learning with Applications to Text Classification.” <em>Journal of Machine Learning Research</em>.</p>
</div>
</div>
</div>
<div id="appendix-a-r-code" class="section level1">
<h1><span class="header-section-number">8</span> Appendix A: R Code</h1>
<p><strong>A0. Dependencies</strong></p>
<p>The following R code installs required package dependencies if not installed and loads them into the R environment.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Dependencies ----</span>
<span class="co"># (package_install) Required packages and loading</span>
packages <-<span class="st"> </span><span class="kw">c</span>(
<span class="st">"randomForest"</span>,
<span class="st">"text2vec"</span>,
<span class="st">"caret"</span>,
<span class="st">"e1071"</span>
)
<span class="cf">for</span> (pkg <span class="cf">in</span> packages) {
<span class="cf">if</span> (<span class="op">!</span><span class="kw">requireNamespace</span>(pkg, <span class="dt">quietly=</span><span class="ot">TRUE</span>)) {
<span class="kw">install.packages</span>(pkg,
<span class="dt">dependencies=</span><span class="ot">TRUE</span>,
<span class="dt">repos=</span><span class="st">"http://cran.rstudio.com/"</span>)
}
<span class="kw">library</span>(pkg, <span class="dt">character.only=</span><span class="ot">TRUE</span>)
}</code></pre></div>
<p><strong>A1. Data</strong></p>
<p>The SLS dataset is downloaded as a .zip file into a temporary directory, unzipped, and collected for file path information stored as variable <strong>slsFiles</strong>. See <a href="#data">Data</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Data ----</span>
<span class="co"># 1. Download the zipped SLS data from UCI into a temp dir</span>
temp <-<span class="st"> </span><span class="kw">tempdir</span>()
src <-<span class="st"> </span><span class="kw">paste0</span>(<span class="st">"http://archive.ics.uci.edu/ml/machine-learning-databases/"</span>,
<span class="st">"00331/sentiment%20labelled%20sentences.zip"</span>)
zipped <-<span class="st"> </span><span class="kw">file.path</span>(temp, <span class="kw">basename</span>(src))
<span class="kw">download.file</span>(src, zipped)
<span class="kw">unzip</span>(zipped, <span class="dt">exdir=</span>temp)
<span class="co"># 2. Obtain SLS data paths from unzipped folder</span>
slsFolder <-<span class="st"> </span><span class="kw">file.path</span>(temp, <span class="st">"sentiment labelled sentences"</span>)
slsIgnore <-<span class="st"> </span><span class="kw">file.path</span>(slsFolder, <span class="kw">c</span>(<span class="st">".DS_Store"</span>, <span class="st">"readme.txt"</span>))
slsFiles <-<span class="st"> </span><span class="kw">list.files</span>(slsFolder, <span class="dt">full.names=</span><span class="ot">TRUE</span>)
slsFiles <-<span class="st"> </span>slsFiles[<span class="op">!</span>slsFiles <span class="op">%in%</span><span class="st"> </span>slsIgnore]</code></pre></div>
<p><em>SLS Dataset Files</em></p>
<table>
<thead>
<tr class="header">
<th align="left">Files</th>
<th align="right">Bytes</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">amazon_cells_labelled.txt</td>
<td align="right">58226</td>
</tr>
<tr class="even">
<td align="left">imdb_labelled.txt</td>
<td align="right">85285</td>
</tr>
<tr class="odd">
<td align="left">yelp_labelled.txt</td>
<td align="right">61320</td>
</tr>
</tbody>
</table>
<p><strong>A2. Parsing</strong></p>
<p>Parsing is done to read the SLS data, given by the file path information <strong>slsFiles</strong> of the previous code, into an R dataframe named <strong>ds</strong> with columns sentence and sentiment, where each row represents an instance. The sentence column contains the explanatory text data and the sentiment column contains the binary data of 0s and 1s. See <a href="#parsing">Parsing</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Parsing ----</span>
<span class="co"># 1. Read the data on a line by line basis</span>
sls <-<span class="st"> </span><span class="kw">sapply</span>(slsFiles, readLines)
<span class="co"># 2. Extract sentence and sentiment attributes</span>
x <-<span class="st"> </span><span class="kw">c</span>() <span class="co"># sentence attribute</span>
y <-<span class="st"> </span><span class="kw">c</span>() <span class="co"># sentiment attribute</span>
<span class="cf">for</span> (n <span class="cf">in</span> sls) { <span class="co"># each instance n</span>
lx <-<span class="st"> </span><span class="kw">nchar</span>(n) <span class="co"># sentence length</span>
xn <-<span class="st"> </span><span class="kw">substr</span>(n, <span class="dv">1</span>, lx <span class="op">-</span><span class="st"> </span><span class="dv">2</span>) <span class="co"># sentence instance</span>
xn <-<span class="st"> </span><span class="kw">gsub</span>(<span class="st">'[[:punct:] ]+'</span>,<span class="st">' '</span>, xn) <span class="co"># remove punctuation</span>
yn <-<span class="st"> </span><span class="kw">as.numeric</span>(<span class="kw">substr</span>(n, lx <span class="op">-</span><span class="st"> </span><span class="dv">1</span>, lx)) <span class="co"># sentiment instance</span>
x <-<span class="st"> </span><span class="kw">c</span>(x, xn)
y <-<span class="st"> </span><span class="kw">c</span>(y, yn)
}
ds <-<span class="st"> </span><span class="kw">data.frame</span>(<span class="dt">sentence =</span> x, <span class="dt">sentiment =</span> y, <span class="dt">stringsAsFactors=</span><span class="ot">FALSE</span>)</code></pre></div>
<p><em>Example of Parsed Dataframe</em></p>
<table>
<thead>
<tr class="header">
<th align="left">sentence</th>
<th align="right">sentiment</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="left">So there is no way for me to plug it in here in the US unless I go by a converter</td>
<td align="right">0</td>
</tr>
<tr class="even">
<td align="left">Good case Excellent value</td>
<td align="right">1</td>
</tr>
<tr class="odd">
<td align="left">Great for the jawbone</td>
<td align="right">1</td>
</tr>
<tr class="even">
<td align="left">Tied to charger for conversations lasting more than 45 minutes MAJOR PROBLEMS</td>
<td align="right">0</td>
</tr>
<tr class="odd">
<td align="left">The mic is great</td>
<td align="right">1</td>
</tr>
</tbody>
</table>
<p><strong>A3. Features</strong></p>
<p>Features are then constructed using the <code>?text2vec</code> word tokenizer to create a bag of words model. This creates variable <strong>wnk</strong> as a matrix where each column is a unique word from all sentence columns of the <strong>ds</strong> data, and each row represents the counts of the words for an instance. See <a href="#features">Features</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Features ----</span>
<span class="co"># 0. Variables</span>
xn <-<span class="st"> </span>ds<span class="op">$</span>sentence
<span class="co"># 1. Obtain words wn for each xn using a tokenizer</span>
wn <-<span class="st"> </span><span class="kw">itoken</span>(xn, tolower, word_tokenizer)
<span class="co"># 2. Vocabulary of wk words for wn </span>
wk <-<span class="st"> </span><span class="kw">vocab_vectorizer</span>(<span class="kw">create_vocabulary</span>(wn))
<span class="co"># 3. Obtain word features matrix wnk given n instances and k words</span>
wnk <-<span class="st"> </span><span class="kw">as.matrix</span>(<span class="kw">get_dtm</span>(<span class="kw">create_corpus</span>(wn, wk)))</code></pre></div>
<p><em>Example of bag of words matrix</em></p>
<table>
<thead>
<tr class="header">
<th align="right">love</th>
<th align="right">nice</th>
<th align="right">works</th>
<th align="right">worst</th>
<th align="right">no</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">1</td>
</tr>
<tr class="even">
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
</tr>
<tr class="odd">
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
</tr>
<tr class="even">
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
</tr>
<tr class="odd">
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
<td align="right">0</td>
</tr>
</tbody>
</table>
<p><strong>A4. Selection</strong></p>
<p>Feature selection was then done to reduce the dimensionality of the <strong>wnk</strong> bag of words matrix into the most useful features for learning. <strong>wnk</strong> then has features removed in which:</p>
<ol style="list-style-type: decimal">
<li>Word occurrence is less than 0.1% of the total occurrences</li>
<li>Correlation value <code>?caret::findCorrelation</code> with another feature is over 0.05</li>
<li>Random Forest variable importance measure <code>?randomForest::importance</code> is not in the top 10 highest values</li>
</ol>
<p>See <a href="#feature-selection">Feature Selection</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Selection ----</span>
<span class="co"># 0. Variables</span>
yn <-<span class="st"> </span><span class="kw">as.factor</span>(ds<span class="op">$</span>sentiment)
<span class="co"># 1. Keep only words that occur more than 0.1% of occurences</span>
freqc <-<span class="st"> </span><span class="kw">apply</span>(wnk, <span class="dv">2</span>, sum)
wnk <-<span class="st"> </span>wnk[, <span class="kw">which</span>(freqc <span class="op">></span><span class="st"> </span><span class="kw">sum</span>(freqc) <span class="op">*</span><span class="st"> </span><span class="fl">0.001</span>)]
<span class="co"># 2. Remove highly correlated features over 0.05</span>
wcor <-<span class="st"> </span><span class="kw">findCorrelation</span>(<span class="kw">cor</span>(wnk), <span class="dt">cutoff =</span> <span class="fl">0.05</span>)
wnk <-<span class="st"> </span>wnk[, <span class="op">-</span>wcor]
<span class="co"># 3. Calculate RF feature importances and select top 10</span>
rfFit <-<span class="st"> </span><span class="kw">randomForest</span>(
<span class="dt">x =</span> wnk,
<span class="dt">y =</span> yn,
<span class="dt">importance =</span> <span class="ot">TRUE</span>,
<span class="dt">proximity =</span> <span class="ot">TRUE</span>,
<span class="dt">ntree =</span> <span class="dv">500</span>
)
impw <-<span class="st"> </span><span class="kw">importance</span>(rfFit)[, <span class="dv">4</span>]
impw <-<span class="st"> </span>impw[<span class="kw">order</span>(<span class="op">-</span>impw)]
wnk <-<span class="st"> </span>wnk[, <span class="kw">names</span>(impw)[<span class="dv">1</span><span class="op">:</span><span class="dv">10</span>]]</code></pre></div>
<p><em>Plot of the Feature Importances</em></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">barplot</span>(impw, <span class="dt">names.arg =</span> <span class="kw">names</span>(impw), <span class="dt">ylab =</span> <span class="st">"Feature Importance"</span>, <span class="dt">las=</span><span class="dv">2</span>)</code></pre></div>
<p><img src="index_files/figure-html/appendix-a4a-1.png" width="672" /></p>
<p><strong>A5. Cross Validation</strong></p>
<p>The row indices of the <strong>wnk</strong> bag of words matrix are then divided into 10 equal parts using <code>?caret::createFolds</code> to use as training and testing data for cross validation performance measures. See <a href="#cross-validation">Cross Validation</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Cross Validation ----</span>
<span class="co"># Each element are the row ids of a 10 fold CV</span>
cv <-<span class="st"> </span><span class="kw">createFolds</span>(<span class="dv">1</span><span class="op">:</span><span class="kw">nrow</span>(wnk),
<span class="dt">k =</span> <span class="dv">10</span>,
<span class="dt">list =</span> <span class="ot">TRUE</span>,
<span class="dt">returnTrain =</span> <span class="ot">TRUE</span>)</code></pre></div>
<p><strong>A6. Learners</strong></p>
<p>Learners were constructed using <code>?e1071::naiveBayes</code>, <code>?e1071::svm</code>, and <code>?randomForest::randomForest</code>. These were trained on 9 folds of the feature selected <strong>wnk</strong> bag of words matrix, and tested on the 1 remaining fold. This is done until all folds have been tested for. During the testing of each fold, <code>?caret::precision</code> and <code>?caret::recall</code> is used to calculate the F1 score and stored in a <strong>results</strong> variable. This <strong>results</strong> variable holds the F1 scores for each fold for each algorithm <em>$nbc</em>, <em>$rf</em> and <em>$svm</em>. See <a href="#selected-learners">Selected Learners</a> section.</p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="co"># Learners ----</span>
<span class="co"># 1. Set performance measures</span>
precision <-<span class="st"> </span>caret<span class="op">::</span>precision
recall <-<span class="st"> </span>caret<span class="op">::</span>recall
<span class="co"># 2. Results list to collect Predictions</span>
results <-<span class="st"> </span><span class="kw">list</span>(
<span class="dt">svm =</span> <span class="kw">c</span>(),
<span class="dt">rf =</span> <span class="kw">c</span>(),
<span class="dt">nbc =</span> <span class="kw">c</span>()
)
<span class="co"># 3. Predict on 10 fold CV</span>
<span class="cf">for</span> (ids <span class="cf">in</span> cv) {
<span class="co"># 3.1 Training data</span>
xtrain <-<span class="st"> </span>wnk[ids, ]
ytrain <-<span class="st"> </span>yn[ids]
<span class="co"># 3.2 Testing data</span>
xtest <-<span class="st"> </span>wnk[<span class="op">-</span>ids, ]
ytest <-<span class="st"> </span>yn[<span class="op">-</span>ids]
<span class="co"># 3.3 Create models</span>
nbcModel <-<span class="st"> </span><span class="kw">naiveBayes</span>(<span class="dt">x =</span> xtrain, <span class="dt">y =</span> ytrain, <span class="dt">laplace =</span> <span class="dv">1</span>)
svmModel <-<span class="st"> </span><span class="kw">svm</span>(<span class="dt">x =</span> xtrain, <span class="dt">y =</span> ytrain, <span class="dt">kernel =</span> <span class="st">"linear"</span>)
rfModel <-<span class="st"> </span><span class="kw">randomForest</span>(<span class="dt">x =</span> xtrain, <span class="dt">y =</span> ytrain, <span class="dt">ntree =</span> <span class="dv">500</span>)
<span class="co"># 3.3 Predictions</span>
nbcPredict <-<span class="st"> </span><span class="kw">predict</span>(nbcModel, xtest)
svmPredict <-<span class="st"> </span><span class="kw">predict</span>(svmModel, xtest)
rfPredict <-<span class="st"> </span><span class="kw">predict</span>(rfModel, xtest)
<span class="co"># 3.4 Evaluate Precision</span>
nbcPrecision <-<span class="st"> </span><span class="kw">precision</span>(nbcPredict, ytest)
svmPrecision <-<span class="st"> </span><span class="kw">precision</span>(svmPredict, ytest)
rfPrecision <-<span class="st"> </span><span class="kw">precision</span>(rfPredict, ytest)
<span class="co"># 3.5 Evaluate Recall</span>
nbcRecall <-<span class="st"> </span><span class="kw">recall</span>(nbcPredict, ytest)
svmRecall <-<span class="st"> </span><span class="kw">recall</span>(svmPredict, ytest)
rfRecall <-<span class="st"> </span><span class="kw">recall</span>(rfPredict, ytest)
<span class="co"># 3.6 Evaluate F1 Score</span>
nbcF1 <-<span class="st"> </span>(<span class="dv">2</span> <span class="op">*</span><span class="st"> </span>nbcPrecision <span class="op">*</span><span class="st"> </span>nbcRecall) <span class="op">/</span><span class="st"> </span>(nbcPrecision <span class="op">+</span><span class="st"> </span>nbcRecall)
svmF1 <-<span class="st"> </span>(<span class="dv">2</span> <span class="op">*</span><span class="st"> </span>svmPrecision <span class="op">*</span><span class="st"> </span>svmRecall) <span class="op">/</span><span class="st"> </span>(svmPrecision <span class="op">+</span><span class="st"> </span>svmRecall)
rfF1 <-<span class="st"> </span>(<span class="dv">2</span> <span class="op">*</span><span class="st"> </span>rfPrecision <span class="op">*</span><span class="st"> </span>rfRecall) <span class="op">/</span><span class="st"> </span>(rfPrecision <span class="op">+</span><span class="st"> </span>rfRecall)
<span class="co"># 3.7 Add F1 score of fold to results</span>
results<span class="op">$</span>nbc <-<span class="st"> </span><span class="kw">c</span>(results<span class="op">$</span>nbc, nbcF1)
results<span class="op">$</span>svm <-<span class="st"> </span><span class="kw">c</span>(results<span class="op">$</span>svm, svmF1)
results<span class="op">$</span>rf <-<span class="st"> </span><span class="kw">c</span>(results<span class="op">$</span>rf, rfF1)
}</code></pre></div>
<p><em>Plot of the Naive Bayes Classifier Results</em></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">plot</span>(results<span class="op">$</span>nbc, <span class="dt">type =</span> <span class="st">'l'</span>, <span class="dt">ylab =</span> <span class="st">"F1 Score"</span>, <span class="dt">xlab =</span> <span class="st">"CV Fold"</span>)</code></pre></div>
<p><img src="index_files/figure-html/appendix-a6a-1.png" width="672" /></p>
<p><em>Plot of the Random Forest Results</em></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">plot</span>(results<span class="op">$</span>rf, <span class="dt">type =</span> <span class="st">'l'</span>, <span class="dt">ylab =</span> <span class="st">"F1 Score"</span>, <span class="dt">xlab =</span> <span class="st">"CV Fold"</span>)</code></pre></div>
<p><img src="index_files/figure-html/appendix-a6b-1.png" width="672" /></p>
<p><em>Plot of the Support Vector Machine Results</em></p>
<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">plot</span>(results<span class="op">$</span>svm, <span class="dt">type =</span> <span class="st">'l'</span>, <span class="dt">ylab =</span> <span class="st">"F1 Score"</span>, <span class="dt">xlab =</span> <span class="st">"CV Fold"</span>)</code></pre></div>
<p><img src="index_files/figure-html/appendix-a6c-1.png" width="672" /></p>
</div>
<div class="footnotes">
<hr />
<ol>
<li id="fn1"><p>In the table, (tab) indicates that a tab character was present in the data sample.<a href="#fnref1">↩</a></p></li>
<li id="fn2"><p>For clarification, a double indicates type numeric and a character indicates type string.<a href="#fnref2">↩</a></p></li>
<li id="fn3"><p>NLP seeks to extract meaning from textual data involving language communication with low level tasks such as identification of individual words and high level tasks such as spelling correction <span class="citation">(Nadkarni, Ohno-Machado, and Chapman 2011)</span><a href="#fnref3">↩</a></p></li>
<li id="fn4"><p>A supervised classification, in this case, refers to the target sentiment classes being known, and the target sentiment values being discrete rather than continuous or numeric in meaning<a href="#fnref4">↩</a></p></li>
<li id="fn5"><p>For example, the text instance “hello goodbye now”, contain 3 words “hello”, “goodbye”, and “now” that are separated by spaces<a href="#fnref5">↩</a></p></li>
<li id="fn6"><p>Word lengths and text lengths are measured in the number of characters, excluding spaces and symbols that define punctuation, which are more specifically the number of alphanumeric characters in this case (e.g. the word “apple” has a length of 5 alphanumeric characters and the text “!*–” has a length of 0 characters containing non-alphanumeric characters)<a href="#fnref6">↩</a></p></li>
<li id="fn7"><p>Features in this report are similar to attributes, except that features refer to the machine-constructed columns to differ from the original sentence and sentiment attributes<a href="#fnref7">↩</a></p></li>
<li id="fn8"><p>Stop words are commonly used words in the language that are often, but not always, removed as they may not hold useful information, but there is a possibility that stop words may be useful<a href="#fnref8">↩</a></p></li>
</ol>
</div>
</section>
</div>
</div>
</div>
</div>
</div>
<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
<script>
require(["gitbook"], function(gitbook) {
gitbook.start({
"sharing": {
"github": false,
"facebook": true,
"twitter": true,
"google": false,
"weibo": false,
"instapper": false,
"vk": false,
"all": ["facebook", "google", "twitter", "weibo", "instapaper"]
},
"fontsettings": {
"theme": "white",
"family": "sans",
"size": 2
},
"edit": {
"link": null,
"text": null
},
"download": null,
"toc": {
"collapse": "subsection"
},
"search": false
});
});
</script>
<!-- dynamically load mathjax for compatibility with self-contained -->
<script>
(function () {
var script = document.createElement("script");
script.type = "text/javascript";
script.src = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
if (location.protocol !== "file:" && /^https?:/.test(script.src))
script.src = script.src.replace(/^https?:/, '');
document.getElementsByTagName("head")[0].appendChild(script);
})();
</script>
</body>
</html>