clean_manually_labeled_further.py
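"""Further clean a manually labeled comment dataset.

Normalizes the comment text, drops duplicates and bot comments, removes
neutral rows, and undersamples the negative class to balance it against
the positive class. Reads new2.csv and writes new2_clean.csv,
new2_clean_noneutral.csv, and new2_balanced_dataset.csv.
"""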
import pandas as pd
import re
import string

# Function to clean the text (convert to lowercase, remove links, digits, punctuation, and emojis)
def clean_text(text):
    # Ensure the input is a string before processing
    if not isinstance(text, str):
        return ''  # Return an empty string for non-string values (e.g., NaN or float)
    # Convert to lowercase
    text = text.lower()
    # Remove links (URLs) first, while their punctuation is still intact
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove punctuation (escape the character class so it is interpreted literally)
    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    # Remove emojis using a comprehensive regex
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub('', text)
    # Remove any remaining non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

# Load the dataset
file_path = "new2.csv"
comments = pd.read_csv(file_path)

# Check if 'Text' column exists
if 'Text' not in comments.columns:
    print("Error: 'Text' column not found in the dataset.")
else:
    # 1. Remove duplicate rows based on the 'Text' column
    comments = comments.drop_duplicates(subset='Text', keep='first')

    # 2. Remove rows where the 'Text' column contains "I am a bot" (case-insensitive)
    comments = comments[~comments['Text'].str.contains(r'i am a bot', case=False, na=False)]

    # 3. Clean the text in the 'Text' column using the clean_text function
    comments['cleaned_text'] = comments['Text'].apply(clean_text)

    # 4. Print a few rows to verify the cleaned text for any remaining unwanted characters
    print("Sample of cleaned text:")
    print(comments[['Text', 'cleaned_text']].head())
    # Drop the original 'Text' column and keep only 'cleaned_text' and 'Sentiment'
    df_cleaned = comments.drop(columns=['Text'])
    df_cleaned = df_cleaned[['cleaned_text', 'Sentiment']]
    df_cleaned['Sentiment'] = df_cleaned['Sentiment'].str.lower()

    # Save the cleaned dataset to a new CSV file
    df_cleaned.to_csv("new2_clean.csv", index=False)
    print(df_cleaned)
    print(df_cleaned.describe())
    print("Original 'Text' column removed. Processed file saved as 'new2_clean.csv'.")
    # Build a binary dataset: drop neutral and empty sentiment labels
    df_binary = df_cleaned[df_cleaned['Sentiment'] != 'neutral']
    df_binary = df_binary[df_binary['Sentiment'] != '']
    df_binary.to_csv("new2_clean_noneutral.csv", index=False)
    print(df_binary['Sentiment'].value_counts())
    print(df_binary.describe())
    # Split the dataset into negative and positive classes
    negative_df = df_binary[df_binary['Sentiment'] == 'negative']
    positive_df = df_binary[df_binary['Sentiment'] == 'positive']

    # Undersample the negative class to match the number of positive samples
    # (assumes the negative class is the larger one)
    negative_undersampled = negative_df.sample(n=len(positive_df), random_state=42)

    # Combine the undersampled negative class with the positive class
    balanced_df = pd.concat([negative_undersampled, positive_df])

    # Check the new class distribution
    print(balanced_df['Sentiment'].value_counts())

    # Save the balanced dataset
    balanced_df.to_csv("new2_balanced_dataset.csv", index=False)