-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTextPreprocessor.cpp
57 lines (42 loc) · 1.47 KB
/
TextPreprocessor.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/*
* TextPreprocessor.cpp
*
* Created on: 28 feb. 2017
* Author: louis
*/
#include "TextPreprocessor.h"
#include <algorithm>
namespace SLM {
// Definition of the static list of tokens that isFiller() treats as fillers.
// Lookups compare for exact string equality, so entries are case-sensitive.
// Besides interjection-like tokens, the list also holds the "<UNK>" and
// "<sil>" tokens, so those are removed by removeFillers() as well.
// NOTE(review): the entry between "hej" and "joh" contains a non-ASCII byte
// that does not decode cleanly in this copy of the file — confirm the intended
// token and that the source file encoding matches the encoding of the input text.
std::vector<std::string> CGNTextPreprocessor::fillers ={"ggg", "ah", "bah", "goh", "ha", "hej", "h�", "joh", "mm-hu", "oeh", "oh", "pff", "tiens", "tja", "uh", "wauw", "weh", "zuh", "zulle", "<UNK>", "<sil>"};
// Base-class constructor: performs no work of its own.
TextPreprocessor::TextPreprocessor() = default;
// Base-class destructor: performs no work of its own.
TextPreprocessor::~TextPreprocessor() = default;
// Constructor: performs no work of its own.
CGNTextPreprocessor::CGNTextPreprocessor() = default;
// Destructor: performs no work of its own.
CGNTextPreprocessor::~CGNTextPreprocessor() = default;
bool CGNTextPreprocessor::isFiller(const std::string& w)
{
auto it = std::find(SLM::CGNTextPreprocessor::fillers.begin(), SLM::CGNTextPreprocessor::fillers.end(), w);
return it != SLM::CGNTextPreprocessor::fillers.end();
}
std::vector<std::string> CGNTextPreprocessor::removeFillers(std::vector<std::string>& words, bool removeMarkers)
{
words.erase(std::remove_if(words.begin(),
words.end(),
[](const std::string& x){return SLM::CGNTextPreprocessor::isFiller(x);}),
words.end());
if(removeMarkers)
{
words.erase(std::remove_if( words.begin(),
words.end(),
[](const std::string& x){return x == "<s>" || x == "</s>";}),
words.end());
}
return words;
}
} /* namespace SLM */