-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreall.py
154 lines (134 loc) · 6.76 KB
/
preall.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import pandas as pd
import sqlite3
import concurrent.futures
import time
import re
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
# Sastrawi stop-word remover for Indonesian text (used in preprocess_text).
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
# Sastrawi stemmer for Indonesian.
# NOTE(review): this stemmer is only referenced from commented-out code in
# preprocess_text below; the active stemming uses the English PorterStemmer.
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()
# NLTK Porter stemmer (English) — applied to the tokenized text.
porter_stemmer = PorterStemmer()
# Dateline prefixes (CITY, site name) that open Kompas articles; removed verbatim
# before the generic "KOMPAS.com" cleanup so the city name is stripped too.
# NOTE(review): the "BOGOR"/"MAGELANG" entries have no comma in the original
# scrape — confirm against the source data before normalizing them.
_DATELINE_PREFIXES = (
    "JAKARTA, KOMPAS.com",
    "DEPOK, KOMPAS.com",
    "BADUNG, KOMPAS.com",
    "BALI, KOMPAS.com",
    "BALIKPAPAN, KOMPAS.com",
    "BANDUNG BARAT, KOMPAS.com",
    "BANDUNG, KOMPAS.com",
    "BANTEN, KOMPAS.com",
    "BANYUWANGI, KOMPAS.com",
    "BATANG, KOMPAS.com",
    "BEKASI, KOMPAS.com",
    "BOGOR KOMPAS.com",
    "BOYOLALI, KOMPAS.com",
    "CILEGON, KOMPAS.com",
    "CIREBON, KOMPAS.com",
    "GRESIK, KOMPAS.com",
    "MADIUN, KOMPAS.com",
    "MAGELANG KOMPAS.com",
    "MAGETAN, KOMPAS.com",
    "MALANG, KOMPAS.com",
    "MAKASSAR, KOMPAS.com",
    "MEDAN, KOMPAS.com",
    "MERAUKE, KOMPAS.com",
    "PADANG, KOMPAS.com",
    "PALU, KOMPAS.com",
    "PURBALINGGA, KOMPAS.com",
    "PURWOKERTO, KOMPAS.com",
    "PURWOREJO, KOMPAS.com",
    "SURABAYA, KOMPAS.com",
    "SURAKARTA, KOMPAS.com",
    "TANGERANG SELATAN, KOMPAS.com",
    "TANGERANG, KOMPAS.com",
    "TASIKMALAYA, KOMPAS.com",
)
# Literal site-boilerplate tokens removed from article bodies, in the original
# removal order.
_NOISE_TOKENS = (
    "Detik News",
    "CNN News",
    "KOMPAS.com",
    "Kompas News",
    "Gambas",
    "20detik",
    "berikutnya",
    "halaman",
    "detikcom",
    "Halaman",
)
# Drops everything except ASCII letters, digits and whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')

def preprocess_text(text):
    """Clean, normalize, tokenize and stem one article body.

    Pipeline: strip site boilerplate -> drop non-alphanumeric characters ->
    lowercase -> remove Indonesian stopwords (Sastrawi) -> tokenize (NLTK)
    -> Porter-stem each token -> rejoin with single spaces.

    Parameters
    ----------
    text : str
        Raw article content. Any non-string input yields ''.

    Returns
    -------
    str
        Space-joined, stemmed tokens.
    """
    start_time = time.time()
    if not isinstance(text, str):
        return ''
    # Strip boilerplate. Datelines go first so the city name is removed along
    # with the site name; remaining literal noise tokens are removed after.
    # (Fix: the old code used re.sub(r'KOMPAS.com', ...) with an unescaped dot,
    # and had a "KOMPAS.comm" typo in the BANYUWANGI dateline.)
    content_cleaned = text.replace('ADVERTISEMENT', '')
    for prefix in _DATELINE_PREFIXES:
        content_cleaned = content_cleaned.replace(prefix, '')
    for token in _NOISE_TOKENS:
        content_cleaned = content_cleaned.replace(token, '')
    # Remove punctuation/symbols, then case-fold.
    content_cleaned = _NON_ALNUM_RE.sub('', content_cleaned)
    content_cleaned = content_cleaned.lower()
    # Stopword removal (Indonesian) and tokenization.
    content_without_stopwords = stopword_remover.remove(content_cleaned)
    tokens = word_tokenize(content_without_stopwords)
    # NOTE(review): PorterStemmer targets English; for Indonesian text the
    # Sastrawi stemmer initialized above may be the intended tool — confirm.
    stemmed_words = [porter_stemmer.stem(token) for token in tokens]
    # Fix: previously this joined the un-stemmed `tokens`, silently discarding
    # the stemming work above.
    processed_text = ' '.join(stemmed_words)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time} seconds for preprocess_text")
    return processed_text
def insert_into_sqlite(url, content, sentimen, title, tanggal, nama_berita):
    """Insert one preprocessed article row into the pre_all table of prepro.db.

    Duplicate URLs (pre_all's integrity constraint) are reported and skipped
    rather than raised, so bulk runs continue past already-stored rows.

    Parameters
    ----------
    url, content, sentimen, title, tanggal, nama_berita : str
        Column values for url_berita, content, sentimen, title,
        tanggal_berita and nama_berita respectively.
    """
    conn = sqlite3.connect('prepro.db')
    try:
        # `with conn` commits on success and rolls back on error, so a failed
        # INSERT no longer leaves an open transaction behind.
        with conn:
            conn.execute(
                '''INSERT INTO pre_all (url_berita,content,sentimen,title,tanggal_berita,nama_berita)
                VALUES (?, ?, ?,?,?,?)''',
                (url, content, sentimen, title, tanggal, nama_berita),
            )
    except sqlite3.IntegrityError:
        print(f"URL '{url}' sudah ada dalam database.")
    finally:
        conn.close()
def get_processed_urls():
    """Return the set of url_berita values already stored in pre_all.

    Used to skip articles that were preprocessed in a previous run.
    """
    db = sqlite3.connect('prepro.db')
    rows = db.execute('''SELECT url_berita FROM pre_all''').fetchall()
    db.close()
    return {record[0] for record in rows}
def preprocess_and_save_text(row, processed_urls, url_count):
    """Preprocess a single article row and persist it to SQLite.

    Skips rows whose link_berita is already in `processed_urls`; otherwise
    cleans the content, stores the result, and records the URL in `url_count`.

    Parameters
    ----------
    row : dict
        One record with keys link_berita, content, sentimen, title,
        tanggal_berita, nama_berita.
    processed_urls : set[str]
        URLs already present in the database.
    url_count : list[str]
        Shared accumulator of URLs processed during this run (mutated).
    """
    url = row['link_berita']
    # Guard clause: nothing to do for URLs seen in a previous run.
    if url in processed_urls:
        print(f"Skip: {url} already processed")
        return
    cleaned = preprocess_text(row['content'])
    insert_into_sqlite(
        url,
        cleaned,
        row['sentimen'],
        row['title'],
        row['tanggal_berita'],
        row['nama_berita'],
    )
    print(f"Processed: {url}")
    url_count.append(url)
def preprocess_texts_parallel(data):
    """Preprocess all article records concurrently with a thread pool.

    Fetches the set of already-processed URLs once, fans the rows out over
    8 worker threads, and reports total wall time and processed-URL count.

    Parameters
    ----------
    data : list[dict]
        Article records as produced by DataFrame.to_dict('records').
    """
    start_time = time.time()
    processed_urls = get_processed_urls()
    url_count = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            # Fix: executor.map returns a lazy iterator — if it is never
            # consumed, worker exceptions are silently discarded. Drain it so
            # any failure in a worker propagates here.
            for _ in executor.map(
                preprocess_and_save_text,
                data,
                [processed_urls] * len(data),
                [url_count] * len(data),
            ):
                pass
    except KeyboardInterrupt:
        print("Keyboard interrupt received. Shutting down gracefully...")
        executor.shutdown(wait=False)
    end_time = time.time()
    print(f"Total waktu yang dibutuhkan: {end_time - start_time} detik")
    print(f"Total URL yang telah diproses: {len(url_count)}")
if __name__ == "__main__":
dataset = pd.read_csv('raw_data_all.csv')
preprocess_texts_parallel(dataset.to_dict('records'))