tweetCleaner.py · 92 lines (71 loc) · 2.14 KB
'''
Python 3.6
This script contains functions to clean the text of tweets.
The functions here are not called directly; they are imported and called
from either "NLTK_clean_tweet_testing.py" or "TextBlob_clean_tweet_testing.py".
'''
print("Importing tweetCleaner...")
from bs4 import BeautifulSoup
import re
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
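# Note: WordNetLemmatizer needs the NLTK WordNet corpus to be available
# locally (it can be fetched with nltk.download('wordnet')).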
"""
Returns a list of stopwords called StopWordList.
The file containing the stopwords is titled "stopwords.txt".
"""
def StopWordListCreator():
StopWordList = []
with open("stopwords.txt","r",encoding="utf-8") as stopwords:
for stopword in stopwords.readlines():
StopWordList.append(stopword[:-1])
return StopWordList
def StopWordRemover(tweet):
    '''
    Removes all stopwords in the tweet, based on the StopWordList created above.
    '''
    # Build the stopword set once instead of re-reading the file for every word
    stopwords = set(StopWordListCreator())
    new_tweet = [word for word in tweet.split() if word not in stopwords]
    return " ".join(new_tweet)
def lowercase(tweet):
'''
Returns the tweet in lowercase.
'''
return tweet.lower()
def removeSpecialChars(tweet):
    '''
    Removes special characters and markup commonly found in tweets.
    '''
    # Strip HTML tags and decode HTML entities into the text they represent
    soup = BeautifulSoup(tweet, "html.parser")
    tweet = soup.get_text()
    # Remove URLs of the form www.* or http(s)://*
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Collapse runs of white space into a single space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding quotes
    tweet = tweet.strip('\'"')
    return tweet
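# Illustrative example (assumed input, traced through the steps above):
#   removeSpecialChars('Check this out http://t.co/xyz @user #python')
#   returns 'Check this out python'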
def removeAllNonAlpha(tweet):
    '''
    Removes every character that is not a letter, a digit or a space.
    '''
    tweet = re.sub('[^A-Za-z0-9 ]+', '', tweet)
    return tweet
def lemmatizer(tweet):
'''
    Attempts to replace every individual word with its root word.
'''
word_list = []
for word in tweet.split():
word_list.append(wordnet_lemmatizer.lemmatize(word))
return (" ".join(word_list))
print("Finished importing tweetCleaner.")