-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessor.py
81 lines (62 loc) · 2.45 KB
/
preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import re
from nltk.corpus import stopwords
class Preprocessor:
    """Clean, stem and stop-word-filter a collection of tweets.

    Each tweet is split on whitespace and every token is passed through a
    fixed chain of cleaners (see ``split_into_stem``). ``start`` runs the
    whole pipeline and returns one cleaned string per tweet.
    """

    # Patterns are compiled once here because the cleaners run once per token.
    _USERNAME_RE = re.compile(r'@[^\s]+')
    _HASHTAG_RE = re.compile(r'#[^\s]+')
    _HYPERLINK_RE = re.compile(r'((www\.[^\s]+)|(https?://[^\s]+))')
    _PUNCTUATION_RE = re.compile(r'[^\w\s]')
    # The trailing separator is a lookahead so it is NOT consumed: that lets
    # the next single-character token use the same space as its own leading
    # separator ("a b c" -> every letter is removed, not just "a" and "c").
    _SINGLE_CHAR_RE = re.compile(r'(?:^| )\w(?= |$)')
    # Every code point outside the Basic Multilingual Plane.
    _EMOJI_RE = re.compile('[\U00010000-\U0010ffff]', flags=re.UNICODE)

    def __init__(self, tweets, ps):
        """Store the tweets and the stemmer.

        Args:
            tweets: mutable, index-assignable sequence of tweet strings.
            ps: stemmer object exposing ``stem(word)`` (the original comment
                names a PorterStemmer).
        """
        self.tweets = tweets
        self.ps = ps

    def start(self):
        """Run the full pipeline and return the cleaned tweets.

        Returns:
            A list with one space-joined string per tweet, stop words removed.

        Side effect: every entry of ``self.tweets`` is replaced in place by
        its list of cleaned tokens.
        """
        # NOTE(review): because the entries become token lists, calling
        # start() a second time on the same instance fails on .split();
        # kept as-is since callers may read self.tweets afterwards.
        for index, tweet in enumerate(self.tweets):
            # join + split drops the empty / blank placeholders left behind
            # by tokens that a cleaner removed entirely.
            self.tweets[index] = " ".join(self.split_into_stem(tweet)).split()
        return self.remove_stop_words(self.tweets)

    def split_into_stem(self, message):
        """Split ``message`` on whitespace and clean every token.

        Order per token: stem, drop @username, drop #hashtag, drop hyperlink,
        drop punctuation, drop single character, drop emoji, drop digits.

        Returns:
            A list with one entry per input token; a token that was removed
            is kept as an empty or blank string so positions are preserved.
        """
        # NOTE(review): stemming runs before the cleaners, so a token with
        # attached punctuation is handed to the stemmer as-is, and stop-word
        # filtering later sees stemmed forms - confirm this order is intended.
        cleaned = []
        for word in message.split():
            token = self.stem_word(word)
            token = self.remove_username(token)
            token = self.remove_hashtags(token)
            token = self.remove_hyperlinks(token)
            token = self.remove_punctuation(token)
            token = self.single_character_remove(token)
            token = self.strip_emoji(token)
            token = self.remove_numeric(token)
            cleaned.append(token)
        return cleaned

    def stem_word(self, word):
        """Return ``word`` stemmed by the stemmer given to the constructor."""
        return self.ps.stem(word)

    # Static helpers

    @staticmethod
    def remove_username(tweet):
        """Delete every ``@`` followed by one or more non-whitespace characters."""
        return Preprocessor._USERNAME_RE.sub('', tweet)

    @staticmethod
    def remove_hashtags(tweet):
        """Delete every ``#`` followed by one or more non-whitespace characters."""
        return Preprocessor._HASHTAG_RE.sub('', tweet)

    @staticmethod
    def remove_hyperlinks(tweet):
        """Delete every run starting with ``www.``, ``http://`` or ``https://``."""
        return Preprocessor._HYPERLINK_RE.sub('', tweet)

    @staticmethod
    def remove_numeric(word):
        """Return ``word`` without the characters for which ``str.isdigit`` is true."""
        return "".join(char for char in word if not char.isdigit())

    @staticmethod
    def remove_punctuation(tweet):
        """Delete every character that is neither a word character nor whitespace."""
        return Preprocessor._PUNCTUATION_RE.sub('', tweet)

    @staticmethod
    def single_character_remove(tweet):
        """Replace each stand-alone single word character with a space.

        A character is stand-alone when it is delimited by a space or by the
        start/end of the string. Consecutive single characters are all
        removed. Runs of spaces may remain in the result.
        """
        return Preprocessor._SINGLE_CHAR_RE.sub(' ', tweet)

    @staticmethod
    def strip_emoji(text):
        """Delete every character in the range U+10000 to U+10FFFF."""
        return Preprocessor._EMOJI_RE.sub(r'', text)

    @staticmethod
    def remove_stop_words(tweet_list):
        """Drop English stop words from every tokenised tweet.

        Args:
            tweet_list: iterable of token sequences (one per tweet).

        Returns:
            A list of strings, each the remaining tokens joined by a space.
        """
        # A set gives O(1) membership tests; the corpus call returns a list.
        stop_words = frozenset(stopwords.words('english'))
        return [
            " ".join(word for word in tokens if word not in stop_words)
            for tokens in tweet_list
        ]