# precontent.py
import pandas as pd
import sqlite3
import concurrent.futures
import time
import re
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
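# Note: nltk.word_tokenize relies on the NLTK 'punkt' tokenizer data.
# If it has not been downloaded yet, uncomment the lines below and run once:
# import nltk
# nltk.download('punkt')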
# Initialize Sastrawi stopword remover
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()
# Initialize Sastrawi stemmer
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()
# Porter
porter_stemmer = PorterStemmer()
def preprocess_text(text):
    start_time = time.time()
    if not isinstance(text, str):
        return ''
    # Remove known boilerplate strings from the news sources
    content_cleaned = re.sub(r'ADVERTISEMENT', '', text)
    content_cleaned = re.sub(r'Detik News', '', content_cleaned)
    content_cleaned = re.sub(r'CNN News', '', content_cleaned)
    # Strip the datelines before the bare 'KOMPAS.com', otherwise the city prefix is left behind
    content_cleaned = re.sub(r'JAKARTA, KOMPAS\.com', '', content_cleaned)
    content_cleaned = re.sub(r'SEMARANG, KOMPAS\.com', '', content_cleaned)
    content_cleaned = re.sub(r'KOMPAS\.com', '', content_cleaned)
    content_cleaned = re.sub(r'Kompas News', '', content_cleaned)
    content_cleaned = re.sub(r'Gambas', '', content_cleaned)
    content_cleaned = re.sub(r'20detik', '', content_cleaned)
    content_cleaned = re.sub(r'berikutnya', '', content_cleaned)
    content_cleaned = re.sub(r'halaman', '', content_cleaned)
    # Remove non-alphanumeric characters
    content_cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', content_cleaned)
    # Case folding
    content_cleaned = content_cleaned.lower()
    # Stopword removal (Sastrawi, Indonesian)
    content_without_stopwords = stopword_remover.remove(content_cleaned)
    # Stemming with Sastrawi (disabled in favour of the Porter stemmer below)
    # stemmed_text = stemmer.stem(content_without_stopwords)
    # Tokenization
    tokens = word_tokenize(content_without_stopwords)
    # Stemming (Porter)
    stemmed_words = [porter_stemmer.stem(token) for token in tokens]
    # Join the processed tokens back into text
    processed_text = ' '.join(stemmed_words)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time} seconds for preprocess_text")
    return processed_text
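# Minimal usage sketch (illustrative only; the sample sentence is made up):
# >>> preprocess_text("JAKARTA, KOMPAS.com - Pemerintah mengumumkan kebijakan baru!")
# The dateline, punctuation and stopwords are stripped, the remaining tokens are
# Porter-stemmed, and the result is returned as a single space-joined string.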
def insert_into_sqlite(url, content, sentimen):
    conn = sqlite3.connect('prepro.db')
    cursor = conn.cursor()
    try:
        cursor.execute('''INSERT INTO pre_content (url_berita, content, sentimen)
                          VALUES (?, ?, ?)''', (url, content, sentimen))
        conn.commit()
    except sqlite3.IntegrityError:
        print(f"URL '{url}' already exists in the database.")
    finally:
        conn.close()
def get_processed_urls():
    conn = sqlite3.connect('prepro.db')
    cursor = conn.cursor()
    cursor.execute('''SELECT url_berita FROM pre_content''')
    processed_urls = set(row[0] for row in cursor.fetchall())
    conn.close()
    return processed_urls
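def ensure_table_exists():
    # Setup sketch, not part of the original script: the functions above assume
    # prepro.db already contains a pre_content table. The schema below is an
    # assumption inferred from the INSERT statement and the IntegrityError
    # handling (url_berita treated as unique).
    conn = sqlite3.connect('prepro.db')
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS pre_content (
                          url_berita TEXT PRIMARY KEY,
                          content TEXT,
                          sentimen TEXT)''')
    conn.commit()
    conn.close()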
def preprocess_and_save_text(row, processed_urls, url_count):
    url = row['link_berita']
    # Skip rows whose URL has already been processed
    if url in processed_urls:
        print(f"Skip: {url} already processed")
        return
    # Otherwise run the preprocessing pipeline on the article content
    content = row['content']
    processed_text = preprocess_text(content)
    # Save the result to SQLite
    insert_into_sqlite(url, processed_text, row['sentimen'])
    print(f"Processed: {url}")
    # Track the URLs processed in this run
    url_count.append(url)
def preprocess_texts_parallel(data):
    start_time = time.time()
    processed_urls = get_processed_urls()
    url_count = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            executor.map(preprocess_and_save_text, data, [processed_urls] * len(data), [url_count] * len(data))
    except KeyboardInterrupt:
        print("Keyboard interrupt received. Shutting down gracefully...")
        executor.shutdown(wait=False)
    end_time = time.time()
    print(f"Total time taken: {end_time - start_time} seconds")
    print(f"Total URLs processed: {len(url_count)}")
if __name__ == "__main__":
    dataset = pd.read_csv('raw_content_sentimen.csv')
    preprocess_texts_parallel(dataset.to_dict('records'))
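# Input assumptions (inferred from the code above): raw_content_sentimen.csv is
# expected to provide at least the columns 'link_berita', 'content' and 'sentimen';
# results are written to the pre_content table in prepro.db.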