# Restaurant_Rating_Prediction_YELPS.py

# -*- coding: utf-8 -*-
"""STAT_ML_Restaurant_Rating.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/18DqI46EqWnFQtBJyeuEobZiWrORoOyfv
"""

import pandas as pd

# The IPython shell escapes below are commented out so that this file parses
# as plain Python (a leading "!" is a syntax error outside a notebook).
# Run them in a notebook cell, or in a shell without the leading "!".
# !pip install -q kaggle

# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/

# !chmod 600 ~/.kaggle/kaggle.json

# !kaggle datasets list

# !kaggle datasets download -d yelp-dataset/yelp-dataset

# !ls

# !unzip \*.zip && rm *.zip

# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Commented out IPython magic to ensure Python compatibility.
import os
# Point the kaggle CLI at the credentials directory on the mounted drive.
os.environ['KAGGLE_CONFIG_DIR'] = '/content/gdrive/MyDrive/kaggle'
# %cd /content/gdrive/MyDrive/kaggle

from time import time
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def load_rows(file_path, nrows=None, only_return_count=False, verbose=True):
    """
    Load a JSON-lines file (one JSON object per line) into a dataframe.

    Parameters
    ----------
    file_path : str
        Path of the JSON-lines file. It is decoded as UTF-8.
    nrows : int or None
        Maximum number of records to read; None reads the whole file.
    only_return_count : bool
        When True the lines are only counted, not parsed.
    verbose : bool
        When True a one-line summary (file name, count, elapsed time) is printed.

    Returns
    -------
    pandas.DataFrame with one row per parsed record, or the record count
    (int) when only_return_count is True.

    Raises
    ------
    json.JSONDecodeError if a non-blank line is not valid JSON.

    Blank (whitespace-only) lines are skipped and not counted, so a trailing
    empty line no longer aborts the load.
    """
    tic = time()
    count = 0
    objs = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, encoding='utf-8') as json_file:
        for line in json_file:
            if nrows is not None and count >= nrows:
                break
            if not line.strip():
                continue
            count += 1
            if not only_return_count:
                objs.append(json.loads(line))
    toc = time()
    if verbose:
        print(file_path.split('/')[-1], 'loaded. Count =', count, ', Time =', round(toc-tic,2), 'secs.')

    if only_return_count:
        return count

    return pd.DataFrame(objs)
    
    
# data generator to load data in chunks
def load_rows_gen(file_path, nrows=1e6, verbose=True):
    """
    Yield the records of a JSON-lines file as successive dataframes.

    Parameters
    ----------
    file_path : str
        Path of the JSON-lines file. It is decoded as UTF-8.
    nrows : int or float
        Maximum number of records per yielded chunk; must be at least 1.
    verbose : bool
        When True the size and load time of each chunk is printed.

    Yields
    ------
    pandas.DataFrame holding up to `nrows` records; the last chunk may be
    smaller. Nothing is yielded for an empty file.

    Raises
    ------
    ValueError if nrows < 1 (raised on first iteration, as this is a generator).
    json.JSONDecodeError if a non-blank line is not valid JSON.

    Blank (whitespace-only) lines are skipped.
    """
    if nrows < 1:
        # A non-positive chunk size could never make progress through the file.
        raise ValueError('nrows must be at least 1')

    with open(file_path, encoding='utf-8') as json_file:
        objs = []
        tic = time()
        for line in json_file:
            if not line.strip():
                continue
            objs.append(json.loads(line))
            if len(objs) >= nrows:
                if verbose:
                    print('Loaded chunk of size:', len(objs), ", Time =", round(time()-tic,2), 'secs.')
                yield pd.DataFrame(objs)
                # Start a fresh chunk and restart its timer.
                objs = []
                tic = time()
        if objs:
            # Final, partially filled chunk.
            if verbose:
                print('Loaded chunk of size:', len(objs), ", Time =", round(time()-tic,2), 'secs.')
            yield pd.DataFrame(objs)

"""***LOADING THE DATASET***"""

# First 50,000 business records
filename = 'yelp_academic_dataset_business.json'
biz_data = load_rows(filename, nrows=50000)
biz_data.shape

biz_data.head()

# Number of businesses per state
biz_data.groupby('state')['business_id'].count()

# First 50,000 review records
filename = 'yelp_academic_dataset_review.json'
rev_data = load_rows(filename, nrows=50000)
rev_data.shape

rev_data.head(10)

# Businesses whose category string mentions "Restaurant"; rows with a
# missing category are treated as non-matching.
restaurant_mask = biz_data['categories'].str.contains('Restaurant', na=False)
a = biz_data[restaurant_mask]
# Reviews written about one of those businesses
restaurant_review_mask = rev_data['business_id'].isin(a['business_id'])
rev = rev_data[restaurant_review_mask]

len(rev)

rev.head(10)
rev.shape

"""***VISUALISATION OF THE DATA***"""

sns.set(rc={'figure.figsize':(11.7,8.27)})
# NOTE(review): the original call was sns.countplot(x='City', data=dfs), but no
# `dfs` frame is defined in this file, so it raised NameError. biz_data is the
# business frame loaded earlier in this script; it presumably carries a
# lower-case 'city' column (Yelp business schema) -- TODO confirm this is the
# plot that was intended.
sns.countplot(x='city', data=biz_data)

# Share of each star rating among the restaurant reviews (counted once)
star_counts = rev['stars'].value_counts()
labels = star_counts.index
sizes = star_counts.values
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')
plt.show()

# Word cloud built from all review texts joined into one string
from wordcloud import WordCloud
text = rev['text']
plt.subplots(figsize = (10,10))
wordcloud = WordCloud(background_color='black', width=1024, height=768).generate(' '.join(text))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

# Work on an independent copy: `rev` is itself a filtered slice, and adding a
# column to a slice of it triggers pandas' SettingWithCopyWarning.
data = rev[['text','stars']].copy()
data.shape

data.head(10)

data['stars'].value_counts()

"""***ADDING LABELS TO THE DATAFRAME***"""

# 1-2 stars -> -1, 4-5 stars -> 1, anything else (including missing) -> 0
star_to_label = {1.0: -1, 2.0: -1, 4.0: 1, 5.0: 1}
labels = data['stars'].map(star_to_label).fillna(0).astype(int).tolist()
data['new_stars'] = labels

data.head(10)

# `axis` can no longer be passed positionally to DataFrame.drop (pandas 2.0),
# so the column is named by keyword. The label column then takes over the
# 'stars' name.
data = data.drop(columns='stars')
data.columns = ['text', 'stars']

data.head(20)

sample_size = 6000 # Above 20% of the original size
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the available rows.
sample = data.sample(min(sample_size, len(data)))
sample = sample.reset_index(drop=True)

# NOTE(review): the original called plot_stars_frequencies(sample), but no such
# function is defined in this file (NameError). A bar chart of label counts is
# drawn instead, using the same idiom this script uses for y_train -- TODO
# confirm this matches the intended plot.
pd.DataFrame(sample['stars'].value_counts()).plot(kind='bar')

"""***PREPROCESSING WORK***"""

def remove_punctuation(text):
    """Return `text` with every character of string.punctuation deleted."""
    import string
    # Translation table mapping each punctuation code point to None (= delete)
    strip_table = {ord(mark): None for mark in string.punctuation}
    return text.translate(strip_table)

# Strip punctuation from every sampled review, overwriting the 'text' column
sample['text'] = sample['text'].apply(remove_punctuation)

# Download the NLTK 'stopwords' corpus
import nltk
nltk.download('stopwords')

def remove_stopwords(text):
    """Lower-case `text` and drop NLTK's English stop words.

    The text is split on whitespace, each word is lower-cased, words found in
    stopwords.words('english') are discarded, and the remaining words are
    joined back with single spaces.

    Requires the NLTK 'stopwords' corpus to be available.
    """
    from nltk.corpus import stopwords
    # A set gives constant-time membership tests; the list returned by NLTK
    # would otherwise be scanned once per word.
    sw = set(stopwords.words('english'))
    lowered_words = (word.lower() for word in text.split())
    # joining the kept words with space separator
    return " ".join(word for word in lowered_words if word not in sw)

# Lower-case each review and drop English stop words, overwriting the 'text' column
sample['text'] = sample['text'].apply(remove_stopwords)

def stemming(text):
    """Return `text` with each whitespace-separated word replaced by its
    English Snowball stem, re-joined with single spaces."""
    from nltk.stem.snowball import SnowballStemmer
    stem_word = SnowballStemmer("english").stem
    return " ".join(map(stem_word, text.split()))

# Reduce every word of every review to its stem, overwriting the 'text' column
sample['text'] = sample['text'].apply(stemming)

from sklearn.feature_extraction.text import TfidfVectorizer
# create a TF-IDF vectorizer object producing float32 weights
vectorizer = TfidfVectorizer(dtype=np.float32)
# learn the vocabulary from the review texts and build their TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sample['text'].values)

# Number of distinct tokens. Read from the fitted vocabulary because
# `get_feature_names()` has been removed from recent scikit-learn releases.
print(len(vectorizer.vocabulary_))

def get_word_freq(tfidf, vectorizer):
    """Return (score, word) pairs sorted from highest to lowest score.

    Parameters
    ----------
    tfidf : matrix whose columns are indexed by `vectorizer.vocabulary_`
        (anything supporting `.sum(axis=0)`, e.g. a scipy sparse matrix).
    vectorizer : object exposing a `vocabulary_` mapping of word -> column index.

    Returns
    -------
    list of (column_sum, word) tuples in descending order; ties on the score
    are ordered by word, also descending.
    """
    # Sum every column in one pass instead of extracting one column per word.
    column_totals = np.asarray(tfidf.sum(axis=0)).ravel()
    return sorted(
        ((column_totals[idx], word) for word, idx in vectorizer.vocabulary_.items()),
        reverse=True,
    )

words_tfidf_scores = get_word_freq(tfidf_matrix, vectorizer)

# Turn the (score, token) pairs into a dataframe indexed by token.
# The float32 cast is applied to the score column only: passing
# dtype=np.float32 to the constructor also targets the string column, which
# newer pandas versions refuse instead of silently ignoring.
words_tfidf_scores_df = pd.DataFrame(words_tfidf_scores, columns=['tfidf_score', 'token'])
words_tfidf_scores_df['tfidf_score'] = words_tfidf_scores_df['tfidf_score'].astype(np.float32)
words_tfidf_scores_df = words_tfidf_scores_df.set_index('token')

words_tfidf_scores_df.head()

words_tfidf_scores_df.info()
words_tfidf_scores_df.shape

"""***HISTOGRAM OF THE 100 HIGHEST WORDS***"""

import matplotlib.pyplot as plt

# Figure and font size shared by both bar charts below
plt.rcParams.update({'figure.figsize': (24,20), 'font.size': 22})

# Horizontal bars for the first 50 rows of the score frame
# (note: the section headings say 100, the plots show 50 rows)
ax = words_tfidf_scores_df.head(50).plot(kind='barh')
ax.invert_yaxis()

"""***HISTOGRAM OF THE 100 LOWEST WORDS***"""

# Horizontal bars for the last 50 rows of the score frame
ax = words_tfidf_scores_df.tail(50).plot(kind='barh')
ax.invert_yaxis()

def getKey(dict, value):
    """Return the first key of `dict` whose value equals `value`.

    Raises IndexError when no key maps to `value`.
    """
    matching_keys = []
    for key in dict.keys():
        if dict[key] == value:
            matching_keys.append(key)
    # Indexing an empty list raises the IndexError callers may rely on
    return matching_keys[0]

top_features_size = 1001

# Names of the first `top_features_size` tokens of the score frame
top_features_names = words_tfidf_scores_df.index.values[:top_features_size]

# And their corresponding column indices from the 'vectorizer.vocabulary_' dictionary
top_features_indices = [vectorizer.vocabulary_[name] for name in top_features_names]

# Creating the final dataframe from the tfidf_matrix with respect to the top features.
# All columns are selected in one slice: the names are already known, so the
# reverse lookup through the vocabulary (a full scan per column) is
# unnecessary, and building the frame in one go avoids inserting ~1000
# columns one at a time.
final_df = pd.DataFrame(
    tfidf_matrix[:, top_features_indices].toarray(),
    columns=top_features_names,
)

# NOTE: a token literally named 'stars' would be overwritten by the label column.
final_df['stars'] = sample['stars']

final_df.head()

plt.rcParams["figure.figsize"] = (12,10)

# NOTE(review): the original called plot_stars_frequencies(final_df), but no
# such function is defined in this file (NameError). A bar chart of label
# counts is drawn instead, using the same idiom this script uses for y_train
# -- TODO confirm this matches the intended plot.
pd.DataFrame(final_df['stars'].value_counts()).plot(kind='bar')

def pretty_print_cm(cm, labels, normalize=False):
    """Draw a confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    cm : 2-D array-like of counts, one row per true class and one column per
        predicted class.
    labels : sequence used for both the row and the column labels; its length
        must match the dimensions of `cm`.
    normalize : bool
        When True each row is divided by its sum. Rows that sum to zero are
        shown as zeros instead of NaN.

    Returns None; the figure is displayed with plt.show().
    """
    import seaborn as sns

    title = 'Confusion matrix'
    if normalize:
        cm = np.asarray(cm, dtype=float)
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero to avoid 0/0 -> NaN.
        cm = np.divide(cm, row_totals, out=np.zeros_like(cm), where=row_totals != 0)
        title = ' '.join(['Normalized', title])

    conf_mat = pd.DataFrame(cm, columns=labels, index=labels)
    # Backslash escaped explicitly: '\ ' is an invalid escape sequence.
    conf_mat.index.name = 'True \\ Predicted'
    plt.figure(figsize=(16,10))
    ax = sns.heatmap(conf_mat, annot=True, cmap='Blues', fmt='g')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    ax.margins(2,2)
    plt.show()

def generate_prediction_results(y_true, y_pred):
    """Return (confusion matrix, classification report) for the given true
    and predicted labels; the report is requested as a dict
    (output_dict=True)."""
    from sklearn import metrics

    matrix = metrics.confusion_matrix(y_true, y_pred)
    report = metrics.classification_report(y_true, y_pred, output_dict=True)
    return matrix, report

from sklearn.model_selection import train_test_split

# Features are every column except the label. The column is named by keyword
# because DataFrame.drop no longer accepts `axis` positionally (pandas 2.0).
X = final_df.drop(columns=['stars'])
y = final_df['stars']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)

# Class distribution of the training labels before balancing
pd.DataFrame(y_train.value_counts()).plot(kind='bar')

"""***BALANCING THE DATASET***"""

from imblearn.over_sampling import SMOTE  #Synthetic Minority OverSampling

sm = SMOTE()
x_cols = X_train.columns
# `fit_sample` has been removed from imbalanced-learn; `fit_resample` is the
# supported name for the same operation. Only the training split is resampled.
X_train, y_train = sm.fit_resample(X_train, y_train)
X_train = pd.DataFrame(X_train, columns=x_cols)
y_train = pd.Series(y_train)

# Class distribution of the training labels after balancing
pd.DataFrame(y_train.value_counts()).plot(kind='bar')

"""***NAIVE BAYES***"""

from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the training split, then predict both splits
clf = GaussianNB().fit(X_train, y_train)
NB_predictions_test = clf.predict(X_test)
NB_predictions_train = clf.predict(X_train)

# Confusion matrix and classification report per split
NB_confusion_matrix_test, NB_classification_report_test = generate_prediction_results(
    y_test, NB_predictions_test)
NB_confusion_matrix_train, NB_classification_report_train = generate_prediction_results(
    y_train, NB_predictions_train)

# Axis labels passed to the confusion-matrix plots
labels = [-1, 0, 1]

# Training split
pretty_print_cm(NB_confusion_matrix_train, labels)

pd.DataFrame(NB_classification_report_train)

# Test split
pretty_print_cm(NB_confusion_matrix_test, labels)

print('Classification report: \n')
pd.DataFrame(NB_classification_report_test)

"""***MULTINOMIAL NAIVE BAYES***"""

from sklearn.naive_bayes import MultinomialNB

# Fit multinomial naive Bayes on the training split, then predict both splits
clf = MultinomialNB().fit(X_train, y_train)
MNB_predictions_train = clf.predict(X_train)
MNB_predictions_test = clf.predict(X_test)

MNB_confusion_matrix_train, MNB_classification_report_train = generate_prediction_results(
    y_train, MNB_predictions_train)

"""***CONFUSION MATRIX FOR TRAIN DATASET***"""

# Axis labels passed to the confusion-matrix plots
labels = [-1, 0, 1]
pretty_print_cm(MNB_confusion_matrix_train, labels)

pd.DataFrame(MNB_classification_report_train)

MNB_confusion_matrix_test, MNB_classification_report_test = generate_prediction_results(
    y_test, MNB_predictions_test)

"""***CONFUSION MATRIX ON TEST-SET***"""

# Confusion matrix for the held-out split
pretty_print_cm(MNB_confusion_matrix_test, labels)

pd.DataFrame(MNB_classification_report_test)