-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleaning.py
135 lines (109 loc) · 6.81 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd
from time import sleep # control the crawl rate to avoid hammering the servers with too many requests
from random import randint
from tqdm import tqdm
import re
import datetime
from google.cloud import storage
class cleaning_review_data():
    """Clean one scraped review CSV and export it as pickle / JSON / GCS blob.

    The constructor loads ``data_full_review/<filename>`` into ``self.data``.
    The ``parsing_*`` methods each add derived columns to ``self.data`` and
    return it. ``to_pickle`` / ``to_json`` require the columns produced by all
    three ``parsing_*`` methods (see ``_EXPORT_COLUMNS``).
    """

    # Rows missing any of these raw columns are discarded on load.
    _REQUIRED_COLUMNS = ['reviewer_feature', 'reviewer_id', 'rating',
                         'date_of_review', 'review_content']

    # Columns written by to_pickle() / to_json(), in export order.
    _EXPORT_COLUMNS = ['reviewer_id', 'rating', 'recommended', 'review_subject',
                       'review_content', 'reviewer_feature', 'purchased_shade',
                       'date_of_review', 'eye_color', 'hair_color', 'skin_tone',
                       'skin_type', 'skin_tone_bin', 'skin_tone_cat',
                       'days_since_launch', 'days_since_launch_scaled',
                       'month_of_purchase', 'finish', 'coverage',
                       'shade_match', 'gifted']

    # Patterns used to peel one trait at a time off the reviewer_feature text.
    _EYE_RE = re.compile(r'.+eyes')
    _HAIR_RE = re.compile(r'.+hair')
    _SKIN_TONE_RE = re.compile(r'.+skin tone')
    _SKIN_TYPE_RE = re.compile(r'.+skin$')

    # Skin tones encoded as 0 ("darker"); every other value is encoded as 1.
    _DARK_SKIN_TONES = ('Dark', 'Ebony', 'Deep')

    # Flag column -> keywords looked for anywhere in the review text.
    # Matching is case-sensitive. Insertion order fixes the column order.
    _KEYWORD_PATTERNS = {
        'finish': re.compile(r'finish|matte|natural'),
        'coverage': re.compile(r'coverage|cover'),
        'shade_match': re.compile(r'shade|match'),
        'gifted': re.compile(r'gifted|receive|incentivize|receid|compliment'),
    }

    def __init__(self, filename: str) -> None:
        """Load ``data_full_review/<filename>`` and drop incomplete rows.

        Args:
            filename: Name of the CSV file inside ``data_full_review/``.
        """
        raw = pd.read_csv(f'data_full_review/{filename}')
        raw = raw.dropna(axis=0, subset=self._REQUIRED_COLUMNS, how='any')
        # reset_index() (without drop) keeps the old row labels in an 'index'
        # column; that column is not part of the export.
        self.data = raw.reset_index()
        self.filename = filename
        self.storage_client = storage.Client.from_service_account_json('foundation-matching-9bb2587b610a.json')

    @classmethod
    def _split_reviewer_feature(cls, text):
        """Split one reviewer_feature string into its four traits.

        Each trait is matched, cleaned, and removed from the text before the
        next one is searched for (skin type is matched last and not removed).

        Returns:
            ``(eye_color, hair_color, skin_tone, skin_type, remaining_text)``;
            a trait that is not found is ``''``.
        """
        eye_color = hair_color = skin_tone = skin_type = ''

        eye = cls._EYE_RE.search(text)
        if eye:
            eye_color = eye.group().replace(' eyes', '')
            text = text.replace(eye.group(), '')

        hair = cls._HAIR_RE.search(text)
        if hair:
            hair_color = hair.group().replace(' hair', '').replace(', ', '')
            text = text.replace(hair.group(), '')

        tone = cls._SKIN_TONE_RE.search(text)
        if tone:
            skin_tone = tone.group().replace(' skin tone', '').replace(', ', '')
            text = text.replace(tone.group(), '')

        kind = cls._SKIN_TYPE_RE.search(text)
        if kind:
            skin_type = kind.group().replace(', ', '').replace(' skin', '')

        return eye_color, hair_color, skin_tone, skin_type, text

    def parsing_reviewer_features(self) -> pd.DataFrame:
        """Derive eye/hair/skin columns from ``reviewer_feature``.

        Adds ``eye_color``, ``hair_color``, ``skin_tone``, ``skin_type``,
        ``skin_tone_bin`` and ``skin_tone_cat``; overwrites ``reviewer_feature``
        with the text left after the eye, hair and skin-tone parts are removed;
        drops rows where any of the four traits is missing or empty; and resets
        the index.

        Returns:
            The updated ``self.data``.
        """
        # Iterate the column itself rather than range(len(...)) + .loc[i], so
        # the method also works when the index is not 0..n-1.
        parsed = [self._split_reviewer_feature(text)
                  for text in tqdm(self.data['reviewer_feature'])]
        names = ['eye_color', 'hair_color', 'skin_tone', 'skin_type', 'reviewer_feature']
        parsed = pd.DataFrame(parsed, columns=names, index=self.data.index)
        for name in names:
            self.data[name] = parsed[name]

        # darker skin tone = 0, lighter skin tone = 1
        lighter = ~self.data['skin_tone'].isin(self._DARK_SKIN_TONES)
        # skin_tone_bin used to be left at its initial 0 for every row even
        # though it is exported; it now carries the same flag as skin_tone_cat.
        self.data['skin_tone_bin'] = lighter.astype(int)
        # NOTE(review): kept as float because the previous row-by-row
        # assignment into a new column presumably produced floats - confirm
        # whether downstream consumers would accept an int column.
        self.data['skin_tone_cat'] = lighter.astype(float)

        traits = ['skin_tone', 'skin_type', 'hair_color', 'eye_color']
        self.data = self.data.dropna(axis=0, subset=traits, how='any')
        for name in ['skin_tone', 'skin_type', 'eye_color', 'hair_color']:
            self.data = self.data[self.data[name] != '']
        self.data = self.data.reset_index(drop=True)
        return self.data

    def parsing_date_of_review(self) -> pd.DataFrame:
        """Normalise ``date_of_review`` and derive time-based columns.

        Text values containing ``ago`` are replaced by today's date minus the
        first number found in the text. The column is then converted with
        ``pd.to_datetime`` and used to add ``days_since_launch`` (days since
        the earliest review), ``days_since_launch_scaled`` (that value divided
        by its maximum) and ``month_of_purchase``. Rows where any derived
        column is missing are dropped.

        Returns:
            The updated ``self.data``.
        """
        today = datetime.date.today()  # read once so every row uses the same day
        resolved = []
        for raw in self.data['date_of_review']:
            # Only text is inspected, so re-running on converted data is safe.
            if isinstance(raw, str) and 'ago' in raw:
                amount = re.search(r'\d+', raw)
                if amount:
                    # NOTE(review): the number is always treated as days,
                    # whatever unit the text uses - confirm against the scraper.
                    raw = today - datetime.timedelta(days=int(amount.group()))
            resolved.append(raw)

        dates = pd.to_datetime(pd.Series(resolved, index=self.data.index, dtype=object))
        self.data['date_of_review'] = dates

        elapsed = (dates - dates.min()).dt.days
        self.data['days_since_launch'] = elapsed
        span = elapsed.max()
        if span == 0:
            # All reviews share one date: 0/0 would give NaN and the dropna
            # below would then discard every row.
            self.data['days_since_launch_scaled'] = elapsed * 0.0
        else:
            self.data['days_since_launch_scaled'] = elapsed / span
        self.data['month_of_purchase'] = dates.dt.month

        self.data = self.data.dropna(
            axis=0,
            subset=['days_since_launch', 'days_since_launch_scaled', 'month_of_purchase'],
            how='any')
        return self.data

    def parsing_review_content(self) -> pd.DataFrame:
        """Add 0/1 keyword flags derived from ``review_content`` and ``rating``.

        For each of ``finish``, ``coverage``, ``shade_match`` and ``gifted`` the
        flag is 1 when the review text contains one of that flag's keywords
        (see ``_KEYWORD_PATTERNS``) and ``rating`` is at least 4, else 0.

        Returns:
            The updated ``self.data``.
        """
        well_rated = self.data['rating'] >= 4
        content = self.data['review_content']
        for column, pattern in self._KEYWORD_PATTERNS.items():
            # search() finds the keyword anywhere in the text, including after
            # a line break; the earlier match()-based patterns only saw most
            # keywords at the very start of the review.
            mentioned = content.map(
                lambda text, rx=pattern: isinstance(text, str) and rx.search(text) is not None)
            self.data[column] = (mentioned.astype(bool) & well_rated).astype(int)
        return self.data

    def _export_stem(self) -> str:
        """Strip ``.csv`` from ``self.filename`` (in place) and return the result."""
        self.filename = self.filename.replace('.csv', '')
        return self.filename

    def to_pickle(self):
        """Write the export columns to ``data_full_review_cleaned/<name>.pkl``.

        ``self.data`` is narrowed to ``_EXPORT_COLUMNS`` and ``self.filename``
        loses its ``.csv`` part as a side effect.
        """
        stem = self._export_stem()
        self.data = self.data[self._EXPORT_COLUMNS]
        return self.data.to_pickle(path=f'data_full_review_cleaned/{stem}.pkl')

    def to_json(self):
        """Write the export columns to ``data_full_review_cleaned/<name>.json``.

        Output is one JSON record per line. ``self.data`` is narrowed to
        ``_EXPORT_COLUMNS`` and ``self.filename`` loses its ``.csv`` part as a
        side effect.
        """
        stem = self._export_stem()
        self.data = self.data[self._EXPORT_COLUMNS]
        return self.data.to_json(f'data_full_review_cleaned/{stem}.json', orient='records', lines=True)

    def to_gcs(self):
        """Upload ``data_full_review_cleaned/<name>.json`` to the ``foundation_reviews`` bucket."""
        # Strip '.csv' here as well so the paths match the file written by
        # to_json() even when this instance did not call to_json() itself.
        stem = self._export_stem()
        bucket = self.storage_client.get_bucket('foundation_reviews')
        blob = bucket.blob(f'{stem}.json')
        blob.upload_from_filename(f'data_full_review_cleaned/{stem}.json')