-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata_inspection.py
186 lines (151 loc) · 5.85 KB
/
data_inspection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
# %%
"""
# VISUM 2021 - data inspection
The dataset of the VISUM 2021 project comprises two main `.csv` tables, namely:
### 1. Outfits table
The Outfits table maps every outfit to the set of products that belong to it. It is available in the `df_outfits.csv` file and contains 3 columns:
- `outfit_id`: the outfit id;
- `main_product_id`: the main product id, representing the anchor product in the outfit;
- `outfit_products`: the set of product ids that belong to the outfit.
### 2. Products table
The Products table lists every product present in the Outfits table together with its product information (i.e. product name, category, and description). It is available in the `df_products.csv` file and contains 4 columns:
- `productid`: the product id;
- `productname`: the product name;
- `category`: the product category;
- `description`: the product description.
"""
"""
## Imports block
"""
import os
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from textwrap import wrap
from prettytable import PrettyTable
from tqdm import tqdm
"""
## Global variables block
"""
# Location of the training split and the two CSV tables inside it.
DATA_DIR = '../master/dataset/train/'
DF_OUTFITS_FN = 'df_outfits.csv'
DF_PRODUCTS_FN = 'df_products.csv'
# Directory holding one image per product.
IMAGES_DIR = os.path.join(DATA_DIR, 'product_images')
# Output directory for the figures produced by this script.
SAVE_PATH = 'inspection_images/'
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(SAVE_PATH, exist_ok=True)
# Image file extensions.
extensions = [".jpg", ".jpeg", ".png"]
"""
## Functions definition block
"""
def from_np_array(array_string):
    ''' Convert a whitespace-separated array string to a numpy array

    The input is the text form of an array whose elements are separated by
    whitespace instead of commas, e.g. "[1 2 3]" or "[[ 1  2]\\n [30 40]]".
    Args:
        array_string (str): array_string.
    Returns:
        (numpy.array): numpy array.
    Raises:
        SyntaxError, ValueError: if the resulting text is not a valid Python
            literal (raised by ast.literal_eval).
    '''
    # Turn every run of whitespace (including newlines) into a single comma.
    comma_separated = ','.join(array_string.split())
    # Whitespace padding right after an opening bracket ("[  1 2]") leaves a
    # stray leading comma ("[,1,2]"); drop it so the literal stays parseable.
    comma_separated = comma_separated.replace('[,', '[')
    return np.array(ast.literal_eval(comma_separated))
def _wrap_cell(value, width):
    ''' Wrap a table cell value into lines of at most `width` characters
    Args:
        value: cell value (converted with str()); may be None/NaN.
        width (int): maximum line width.
    Returns:
        (list of str): wrapped lines; always at least one element, [''] for
            missing or blank values.
    '''
    if pd.isna(value):
        # A missing value would otherwise be rendered as the text "nan".
        return ['']
    # wrap() returns [] for empty/blank text; keep one (empty) line so that
    # callers can always index element 0.
    return wrap(str(value), width) or ['']


def display_outfit(outfit_id, df_outfits, df_products, imgs_dir):
    ''' Display all product's information of a given outfit (outfit_id)

    Every product image (`<imgs_dir>/<product id>.jpg`) is drawn in its own
    subplot of a single-row figure, which is saved to
    `<SAVE_PATH>/<outfit_id>.png`. A table with the id, name, category and
    description of every product is printed to stdout.
    Args:
        outfit_id (int): outfit id.
        df_outfits (pandas.dataframe): outfits dataframe.
        df_products (pandas.dataframe): products dataframe.
        imgs_dir (str): products images directory.
    Returns:
        None
    Raises:
        IndexError: if `outfit_id` is not in `df_outfits` or one of the
            outfit's products is not in `df_products` (raised by `.iloc[0]`).
    '''
    # get outfit products
    outfit = df_outfits[df_outfits['outfit_id'] == outfit_id]['outfit_products'].iloc[0]

    # init outfit table
    outfit_table = PrettyTable()
    outfit_table.field_names = ["Id", "productname", "category", "description"]

    for i, p in enumerate(outfit):
        # read product image
        im_fn = os.path.join(imgs_dir, str(p) + '.jpg')
        im = mpimg.imread(im_fn)

        # get product info
        prod_info = df_products.loc[df_products['productid'] == p].iloc[0]

        # Wrap the long text columns, then pad both to the same number of
        # lines so no wrapped line of either column is dropped by zip().
        desc = _wrap_cell(prod_info['description'], 65)
        name = _wrap_cell(prod_info['productname'], 15)
        n_lines = max(len(desc), len(name))
        desc = desc + [''] * (n_lines - len(desc))
        name = name + [''] * (n_lines - len(name))

        # The first line carries the id and category; continuation lines only
        # carry the remaining wrapped text.
        outfit_table.add_row([prod_info['productid'], name[0], prod_info['category'], desc[0]])
        for subseq_desc, subseq_name in zip(desc[1:], name[1:]):
            outfit_table.add_row(['', subseq_name, '', subseq_desc])

        # imshow
        plt.subplot(1, len(outfit), i+1)
        plt.imshow(im)
        plt.title(str(p))
        plt.axis('off')

    plt.savefig(os.path.join(SAVE_PATH, str(outfit_id)+".png"))

    outfit_table.align = "l"
    print("Outfit ", outfit_id)
    print(outfit_table)
def get_min_max_avg(df):
    '''
    Get the max, min and avg number of products per outfit in the given dataframe
    Args:
        df (Dataframe): dataframe (df_outfits); every entry of its
            `outfit_products` column must support len().
    Return:
        max (int): max number of items in an outfit
        min (int): min number of items in an outfit
        avg (float): avg number of items in an outfit, rounded to 1 decimal
    Raises:
        ValueError: if the dataframe has no rows.
    '''
    # Only the column values are needed, so iterate the column directly
    # instead of materialising every row with iterrows().
    sizes = [len(outfit) for outfit in df['outfit_products']]
    if not sizes:
        raise ValueError("cannot compute outfit statistics on an empty dataframe")
    avg = sum(sizes) / len(sizes)
    return np.max(sizes), np.min(sizes), round(avg, 1)
"""
## Read `df_outfits` and `df_products` table
"""
# Load the outfits table; the `outfit_products` column is parsed from its
# string form into a numpy array by `from_np_array`.
df_outfits_fn = os.path.join(DATA_DIR, DF_OUTFITS_FN)
df_outfits = pd.read_csv(
    df_outfits_fn,
    index_col=0,
    converters={'outfit_products': from_np_array},
)
print("Outfits:\n", df_outfits)

# Load the products table.
df_products_fn = os.path.join(DATA_DIR, DF_PRODUCTS_FN)
df_products = pd.read_csv(
    df_products_fn,
    index_col=0,
)
print("Products:\n", df_products)
"""
## Display Outfits info
"""
# Example outfit to render: its product images are saved as a figure and its
# product table is printed.
outfit_id = 1002569
display_outfit(
    outfit_id=outfit_id,
    df_outfits=df_outfits,
    df_products=df_products,
    imgs_dir=IMAGES_DIR,
)
"""
### Dataset statistics
- Min, Max, and AVG number of products per outfit
- Category distribution (total number of categories, table, bar plot)
"""
# Avg/Min/Max number of products per outfit
max_prods, min_prods, avg_prods = get_min_max_avg(df_outfits)
print(f"max prods per outfit: {max_prods}, min prods per outfit: {min_prods}, avg prods per outfit: {avg_prods}")
# Get total of unique categories
print(f"number of categories: {len(df_products['category'].unique())}")
# Get number of unique products: count distinct product ids rather than rows,
# so a product listed on more than one row is not counted twice.
n_products_unique = df_products['productid'].nunique()
print(f"number of unique products: {n_products_unique}")
# Get number of unique outfits: count distinct outfit ids rather than rows.
n_unique_outfits = df_outfits['outfit_id'].nunique()
print(f"number of unique outfits: {n_unique_outfits}")
# %%
# Bar chart of the number of products in each category, saved as
# `categories.png` under SAVE_PATH.
products_count = df_products['category'].value_counts()
plt.figure(figsize=(40, 30))
plot = plt.bar(
    products_count.index,
    products_count.values,
)
# Axis labels and title
plt.xlabel('Category', fontsize=12)
plt.ylabel('Number of Products', fontsize=12)
plt.title('Frequency Distribution of products by categories')
# Category names are drawn vertically
plt.xticks(rotation=90, fontsize=15)
plt.savefig(os.path.join(SAVE_PATH, "categories.png"))
# %%
# Table with the number of products in each category.
# Lift the pandas row limit so the printed table is not truncated.
pd.set_option('display.max_rows', None)
df = pd.DataFrame(
    data={
        'category': list(products_count.index),
        'n_products': list(products_count.values),
    }
)
print(df)