-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_all.py
120 lines (108 loc) · 5.8 KB
/
clean_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pandas as pd
import concurrent.futures
import time
import re
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
# Module-level NLP helpers, built once at import time and shared by every call.

# Sastrawi stop-word remover; preprocess_text() calls its .remove() method.
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
# Sastrawi stemmer. NOTE(review): currently not used by preprocess_text().
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()
# NLTK Porter stemmer. NOTE(review): currently not used by preprocess_text().
porter_stemmer = PorterStemmer()
# City names that prefix Kompas datelines ("<CITY>, KOMPAS.com"). Each city is
# listed exactly once; the regex below is what handles the punctuation.
_KOMPAS_DATELINE_CITIES = (
    "ACEH", "BADUNG", "BALI", "BALIKPAPAN", "BANDUNG BARAT", "BANDUNG",
    "BANTEN", "BANYUWANGI", "BATANG", "BEKASI", "BOGOR", "BOYOLALI",
    "CILEGON", "CIREBON", "DEPOK", "GRESIK", "JAKARTA", "MADIUN",
    "MAGELANG", "MAGETAN", "MAKASSAR", "MALANG", "MEDAN", "MERAUKE",
    "PADANG", "PALU", "PURBALINGGA", "PURWOKERTO", "PURWOREJO", "SURABAYA",
    "SURAKARTA", "TANGERANG SELATAN", "TANGERANG", "TASIKMALAYA",
)

# Matches "<CITY>, KOMPAS.com" as well as "<CITY> KOMPAS.com" (comma optional)
# for every listed city. Longer names come first so "BANDUNG BARAT" and
# "TANGERANG SELATAN" are preferred over their one-word prefixes.
_KOMPAS_DATELINE_RE = re.compile(
    r"(?:%s),? KOMPAS\.com"
    % "|".join(
        re.escape(city)
        for city in sorted(_KOMPAS_DATELINE_CITIES, key=len, reverse=True)
    )
)

# Literal site/boilerplate markers, removed one after another in this order.
# Matching is case-sensitive and not word-bounded (substrings are removed too).
_BOILERPLATE_MARKERS = (
    "Detik News",
    "CNN News",
    "KOMPAS.com",
    "Kompas News",
    "Gambas",
    "20detik",
    "berikutnya",
    "halaman",
    "detikcom",
    "Halaman",
)

# Any run of characters that are not ASCII letters, digits or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]+')


def preprocess_text(text):
    """Clean one news article body and return it as a normalised string.

    Steps, in order:
      1. Remove the literal "ADVERTISEMENT".
      2. Remove Kompas datelines ("<CITY>, KOMPAS.com", comma optional) for
         the cities in ``_KOMPAS_DATELINE_CITIES``.
      3. Remove the literal markers in ``_BOILERPLATE_MARKERS``.
      4. Replace every run of non-alphanumeric, non-whitespace characters
         with a single space.
      5. Lower-case the text.
      6. Remove stop words with the module-level ``stopword_remover``.
      7. Tokenise with ``word_tokenize`` and re-join tokens with one space.

    Stemming is not applied.

    Args:
        text: The raw article text. Any non-``str`` value (for example a
            missing value such as ``None`` or NaN) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.

    Side effects:
        Prints the elapsed time of the call to stdout for string inputs.
    """
    start_time = time.perf_counter()
    if not isinstance(text, str):
        return ''

    # Boilerplate is stripped before case folding, so matching is
    # case-sensitive on the raw text.
    cleaned = text.replace('ADVERTISEMENT', '')
    cleaned = _KOMPAS_DATELINE_RE.sub('', cleaned)
    for marker in _BOILERPLATE_MARKERS:
        # str.replace keeps the match literal: the "." in "KOMPAS.com" must
        # not act as a regex wildcard.
        cleaned = cleaned.replace(marker, '')

    # Drop punctuation and other special characters.
    cleaned = _NON_ALNUM_RE.sub(' ', cleaned)
    # Case folding.
    cleaned = cleaned.lower()
    # Stop-word removal.
    without_stopwords = stopword_remover.remove(cleaned)
    # Tokenise, then join back so whitespace is normalised to single spaces.
    tokens = word_tokenize(without_stopwords)
    processed_text = ' '.join(tokens)

    elapsed = time.perf_counter() - start_time
    print(f"Total time taken: {elapsed} seconds for preprocess_text")
    return processed_text
def preprocess_and_save_text(row):
    """Clean one news record and return it together with its original fields.

    Despite the name, nothing is written to disk here; the caller collects
    the returned tuples.

    Args:
        row: Mapping that provides the keys 'content', 'title',
            'tanggal_berita', 'link_berita' and 'asal_berita'.

    Returns:
        A 6-tuple ``(cleaned_content, title, content, tanggal_berita,
        link_berita, asal_berita)``.
    """
    cleaned = preprocess_text(row['content'])
    passthrough_keys = (
        'title',
        'content',
        'tanggal_berita',
        'link_berita',
        'asal_berita',
    )
    # Cleaned text first, then the untouched source fields in fixed order.
    return (cleaned, *(row[key] for key in passthrough_keys))
def preprocess_texts_parallel(
    data,
    output_path='online_news_50000_clean_all.csv',
    max_workers=8,
):
    """Clean every record in ``data`` on a thread pool and write a CSV.

    Each record is passed to ``preprocess_and_save_text``; the results are
    assembled into a DataFrame with the columns title, content,
    tanggal_berita, content_clean, link_berita, asal_berita (in that order)
    and written to ``output_path`` without the index.

    Args:
        data: Iterable of record mappings, e.g. ``df.to_dict('records')``.
            May be empty, in which case a header-only CSV is written.
        output_path: Destination CSV path. Defaults to the file name that
            was previously hard-coded.
        max_workers: Size of the thread pool. Defaults to 8.

    Returns:
        None. The elapsed wall-clock time is printed to stdout.
    """
    start_time = time.time()
    # NOTE(review): the per-record work looks CPU-bound, so a thread pool may
    # give little speed-up under the GIL — measure before changing executors.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(preprocess_and_save_text, data))

    # Field order of the tuples returned by preprocess_and_save_text.
    result_fields = (
        'content_clean',
        'title',
        'content',
        'tanggal_berita',
        'link_berita',
        'asal_berita',
    )
    if results:
        columns = dict(zip(result_fields, zip(*results)))
    else:
        # zip(*[]) yields nothing, so unpacking it into six names would raise
        # ValueError; fall back to empty columns instead.
        columns = {field: () for field in result_fields}

    # Column order of the output file.
    csv_columns = (
        'title',
        'content',
        'tanggal_berita',
        'content_clean',
        'link_berita',
        'asal_berita',
    )
    df = pd.DataFrame({name: columns[name] for name in csv_columns})
    df.to_csv(output_path, index=False)

    end_time = time.time()
    print(f"Total waktu yang dibutuhkan: {end_time - start_time} detik")
if __name__ == "__main__":
    # Script entry point: load the raw news CSV, turn each row into a dict,
    # and run the parallel cleaning pipeline over all of them.
    news_records = pd.read_csv('online_news_50000_all.csv').to_dict('records')
    preprocess_texts_parallel(news_records)