## _NBPO.py

## Noise robust Bayesian Point-wise Optimization (NBPO)
## Wenhui Yu 2020.04.26
## author @Wenhui Yu, yuwh16@mails.tsinghua.edu.cn

## Parameters setting
dataset = 0                         # dataset index: 0 for Amazon, 1 for movielens
eta = [0.05, 0.02][dataset]         # initial learning rate (train_Model multiplies it by 0.99 every epoch)
lambda_r = [1, 0.5][dataset]        # regularization coefficient applied to U and V
lambda_phi = [5, 1][dataset]        # regularization coefficient applied to P and Q
K0 = 50                             # length of the latent factors U, V
K1 = 50                             # length of the latent factors P, Q (probability part)
vali_test = 0                       # 0 for validation set, 1 for test set
sample_rate = 4                     # number of negative samples drawn for each positive one
batch_size = 5000                   # number of training pairs per gradient update
epoch = 200                         # number of epochs
top_k = [2, 5, 10, 20, 50, 100]     # cut-offs k at which F1@k and NDCG@k are reported
Model = 'NBPO'                      # label used in the printed summary and the result file name

## Paths to save and read
# The paths are assembled with os.path.join instead of a hard-coded '\\' so the
# data files are also found on systems whose separator is '/'; on Windows the
# resulting strings are exactly the same as before.
# os is imported right here because these lines run before the file's import section.
import os

# list for datasets
dataset_list = ['amazon', 'movielens']
path_train = os.path.join('dataset', dataset_list[dataset], 'train_data.json')
path_validation = os.path.join('dataset', dataset_list[dataset], 'validation_data.json')
path_test = os.path.join('dataset', dataset_list[dataset], 'test_data.json')

import numpy as np
from numpy import *
import xlwt
import time
from Library import readdata
from Library import evaluation_F1
from Library import evaluation_NDCG
from Library import save_parameters
from Library import save_df

import xlrd, xlwt
from xlutils.copy import copy as xl_copy
from openpyxl import load_workbook
from openpyxl import Workbook
import pandas as pd

def d(x):
    """Return a saturating logistic sigmoid of the scalar ``x``.

    For ``x > 10`` the result is clamped to exactly 1.0 and for ``x < -10``
    to exactly 0.0; in between it is ``1 / (1 + exp(-x))``.

    A NaN input fails both comparisons and therefore reaches the formula,
    so NaN is returned. Previously the function fell off the end and
    returned None in that case, which made the caller fail with a TypeError
    instead of letting its own divergence check stop the training.
    """
    if x > 10:
        return 1.0
    if x < -10:
        return 0.0
    # Reached for -10 <= x <= 10 and for NaN (which propagates as NaN).
    return 1.0 / (1.0 + np.exp(-x))

def test_Model(U, V):
    """Score every item for every evaluated user and average F1 / NDCG.

    U and V are indexed so that ``np.dot(V, U[u])`` yields one score per
    item for user ``u``. Users whose entry in the module-level ``Test`` is
    empty are skipped. Items listed in ``train_data_aux[u]`` are removed
    from the ranking before it is handed to ``evaluation_F1`` and
    ``evaluation_NDCG``.

    Returns a pair ``(F1, NDCG)`` of plain lists holding, for each cut-off
    in ``top_k``, the metric averaged over the evaluated users.
    NOTE: if no user has any test item, ``user_num`` stays 0 and the final
    division is a division by zero.
    """
    k_num = len(top_k)
    # k_num-length accumulators for F1 and NDCG
    F1 = np.zeros(k_num)
    NDCG = np.zeros(k_num)

    user_num = 0
    for u in range(M):
        # the data in the test set is [[i, i, i, i], ..., [i, i]]
        test_item = Test[u]
        if len(test_item) == 0:
            continue
        user_num += 1
        # score all items
        score = np.dot(V, U[u])
        # Descending ranking. A stable ascending sort that is then reversed
        # keeps the previous tie order (equal scores: larger index first).
        ranked = np.argsort(score, kind='mergesort')[::-1].tolist()
        # Drop the training items with one set-based pass; list.remove() per
        # item rescanned the ranking every time and raised ValueError when a
        # training item was listed twice.
        seen = set(train_data_aux[u])
        order = [item for item in ranked if item not in seen]
        # for each k, accumulate the top-k metrics
        for i, k in enumerate(top_k):
            F1[i] += evaluation_F1(order, k, test_item)
            NDCG[i] += evaluation_NDCG(order, k, test_item)
    # calculate the average over the evaluated users
    F1 = (F1 / user_num).tolist()
    NDCG = (NDCG / user_num).tolist()

    return F1, NDCG

def train_Model(eta):
    """Train the four factor matrices with mini-batch updates and log metrics.

    ``eta`` is the initial learning rate; it is multiplied by 0.99 at the
    start of every epoch. Everything else (M, N, K0, K1, train_data,
    train_data_aux, sample_rate, batch_size, epoch, lambda_r, lambda_phi,
    top_k, path_excel) is read from module-level names.

    The model is evaluated once before training (row 0) and once after each
    epoch (row ep + 1); each evaluation is stored in two DataFrames with one
    column per entry of ``top_k`` and passed to ``save_df`` together with
    ``path_excel``. Training stops early as soon as ``abs(U.sum())`` is no
    longer below 10 ** 10.

    Returns ``(F1_df, NDCG_df)``.

    NOTE(review): ``random`` and ``math`` are not imported by name in this
    file; presumably they are provided by ``from numpy import *`` - confirm.
    The function uses Python 2 print statements and calls ``.append`` on the
    result of ``range``, so it needs a Python 2 interpreter.
    """
    # initialization
    # U (M x K0) and V (N x K0): entries are random.random() / sqrt(K0)
    U = np.array([np.array([(random.random() / math.sqrt(K0)) for j in range(K0)]) for i in range(M)])
    V = np.array([np.array([(random.random() / math.sqrt(K0)) for j in range(K0)]) for i in range(N)])
    # P (M x K1) and Q (N x K1): same construction scaled by 5, with the sign of
    # every entry of Q flipped
    P = 5 * np.array([np.array([(random.random() / math.sqrt(K1)) for j in range(K1)]) for i in range(M)])
    Q = 5 * np.array([np.array([(-random.random() / math.sqrt(K1)) for j in range(K1)]) for i in range(N)])
    # divergence threshold for abs(U.sum())
    e = 10 ** 10
    # output a result without training
    print 'iteration ', 0,
    [F1, NDCG] = test_Model(U, V)
    # Fmax tracks the best F1 at the first cut-off (top_k[0]); it is only printed here
    Fmax = 0
    if F1[0] > Fmax:
        Fmax = F1[0]
    print Fmax, 'F1: ', F1, '  ', 'NDCG: ', NDCG
    # record the metrics (one row per evaluation) and hand them to save_df
    F1_df = pd.DataFrame(columns=top_k)
    NDCG_df = pd.DataFrame(columns=top_k)
    F1_df.loc[0] = F1
    NDCG_df.loc[0] = NDCG
    save_df([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)  # @x
    # get the number of training samples
    Re = len(train_data)
    # batch boundaries: 0, batch_size, 2 * batch_size, ..., Re
    # (range must return a list here so that append works)
    bs = range(0, Re, batch_size)
    bs.append(Re)
    # begin iterating
    for ep in range(epoch):
        print 'iteration ', ep + 1,
        # decay the learning rate before the epoch starts
        eta = eta * 0.99
        # in each epoch, all training samples are used, one batch at a time
        for ii in range(len(bs) - 1):
            # once U has diverged the remaining batches are skipped
            if abs(U.sum()) < e:
                # input the samples as batches
                # initialize dU, dV, dP and dQ to accumulate the gradient of this batch
                dU = np.zeros((M, K0))
                dV = np.zeros((N, K0))
                dP = np.zeros((M, K1))
                dQ = np.zeros((N, K1))
                for re in range(bs[ii], bs[ii + 1]):
                    # iterate each batch
                    # train data, [u, i]
                    u = train_data[re][0]
                    i = train_data[re][1]
                    # raw scores of the observed pair under (U, V) and (P, Q)
                    Ri = np.dot(U[u], V[i])
                    gammai = np.dot(P[u], Q[i])
                    # select negative samples randomly
                    num = 0
                    while num < sample_rate:
                        j = int(random.uniform(0, N))
                        # reject j if user u has it in the training data
                        if not (j in train_data_aux[u]):
                            num += 1
                            # raw scores of the sampled pair
                            Rj = np.dot(U[u], V[j])
                            gammaj = np.dot(P[u], Q[j])
                            # terms of the observed pair (u, i); they are added once
                            # per accepted negative, i.e. sample_rate times in total
                            D = d(-gammai) * d(-Ri)
                            dU[u] += D * V[i]
                            dV[i] += D * U[u]
                            D = -d(gammai) * d(Ri)
                            dP[u] += D * Q[i]
                            dQ[i] += D * P[u]

                            # terms of the sampled pair (u, j)
                            D = d(gammaj) * d(-Rj) - d(Rj)
                            dU[u] += D * V[j]
                            dV[j] += D * U[u]
                            D = d(-gammaj) * d(Rj)
                            dP[u] += D * Q[j]
                            dQ[j] += D * P[u]

                # update the matrices: accumulated gradient minus the regularization term
                U += eta * (dU - lambda_r * U)
                V += eta * (dV - lambda_r * V)
                P += eta * (dP - lambda_phi * P)
                Q += eta * (dQ - lambda_phi * Q)
        # evaluate and save after the epoch, or stop training if U has diverged
        if abs(U.sum()) < e:
            [F1, NDCG] = test_Model(U, V)
            if F1[0] > Fmax:
                Fmax = F1[0]
            F1_df.loc[ep + 1] = F1
            NDCG_df.loc[ep + 1] = NDCG
            print 'F1: ', F1, '  ', 'NDCG: ', NDCG
            save_df([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)  # @x
        else:
            break
    return F1_df, NDCG_df

def print_parameter():
    # Echo the run configuration to stdout: one line per setting (related
    # settings share a line), followed by one empty line.
    report = (
        ('model:', Model),
        ('dataset:', dataset_list[dataset]),
        ('eta:', eta),
        ('lambda_r:', lambda_r, 'lambda_phi:', lambda_phi),
        ('K0:', K0, 'K1:', K1),
        ('vali_test:', ['validation', 'test'][vali_test]),
        ('sample_rate:', sample_rate),
        ('batch_size:', batch_size),
        ('epoch:', epoch),
        ('top_k:', top_k),
    )
    for fields in report:
        # A single parenthesised argument prints the same text as the
        # comma-separated print statement: str() of each field, space-joined.
        print(' '.join(str(field) for field in fields))
    print('')

'''**************************main_function***************************'''
'''**************************main_function***************************'''

# Imported here so this section does not depend on any other part of the file
# for the path handling below.
import os

# load the data
# readdata(path_train) yields the training pairs, the per-user auxiliary
# structure used for membership tests, and the sizes M and N that the functions
# above read as module-level names.
[train_data, train_data_aux, M, N] = readdata(path_train)
# for the held-out files only element [1] of the readdata result is used
validation_data = readdata(path_validation)[1]
test_data = readdata(path_test)[1]
# choose validation or test set; test_Model reads it through the global Test
if vali_test == 0:
    Test = validation_data
else:
    Test = test_data

# parameter table handed to save_parameters: [name, list of values] per entry
data = [
    ["Model", [Model]],
    ["dataset", [dataset_list[dataset]]],
    ["eta", [eta]],
    ["lambda_r", [lambda_r]],
    ["lambda_phi", [lambda_phi]],
    ["K0", [K0]],
    ["K1", [K1]],
    ["vali_test", [['validation', 'test'][vali_test]]],
    ["sample_rate", [sample_rate]],
    ["batch_size", [batch_size]],
    ['epoch', [epoch]],
    ["top_k", top_k]
]
# Result file: <dataset>_<Model>_<unix time><random 3-digit number>.xlsx inside
# experiment_result. os.path.join replaces the hard-coded '\\' so the file lands
# in that directory on every platform; on Windows the string is unchanged.
path_excel = os.path.join(
    'experiment_result',
    dataset_list[dataset] + '_' + Model + '_' + str(int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx')
print_parameter()
save_parameters(data, path_excel)
F1_df, NDCG_df = train_Model(eta)


# ## Noise robust Bayesian Point-wise Optimization (NBPO)
# ## Wenhui Yu 2020.04.26
# ## author @Wenhui Yu, yuwh16@mails.tsinghua.edu.cn
#
# ## Parameters setting
# dataset = 0                         # datasets, 0 for Amazon, 1 for movielens
# eta = [0.05, 0.02][dataset]         # learning rate
# lambda_r = [1, 0.5][dataset]        # coefficient of regularization term
# lambda_phi = [5, 1][dataset]        # coefficient of regularization term
# K0 = 50                             # length of latent factors
# K1 = 50                             # length of latent factors for probability
# vali_test = 0                       # 0 for validation set, 1 for test set
# sample_rate = 4                     # sample rate, the number of negative samples foreach positive one
# batch_size = 5000                   # batch size
# epoch = 200                         # number of epochs
# top_k = [2, 5, 10, 20, 50, 100]     # top k items to recommend
# Model = 'NBPO'
#
# ## Paths to save and read
# # list for datasets
# dataset_list = ['amazon', 'movielens']
# path_train = 'dataset\\' + dataset_list[dataset] + '\\train_data.json'
# path_validation = 'dataset\\' + dataset_list[dataset] + '\\validation_data.json'
# path_test = 'dataset\\' + dataset_list[dataset] + '\\test_data.json'
#
# import numpy as np
# from numpy import *
# import xlwt
# import time
# from Library import readdata
# from Library import evaluation_F1
# from Library import evaluation_NDCG
# from Library import save_parameters
# from Library import save_df
#
# import xlrd, xlwt
# from xlutils.copy import copy as xl_copy
# from openpyxl import load_workbook
# from openpyxl import Workbook
# import pandas as pd
#
# def d(x):
#     # sigmoid(x)
#     if x > 10:
#         return 1
#     if x < -10:
#         return 0
#     if x >= -10 and x <= 10:
#         return 1.0 / (1.0 + exp(-x))
#
# def test_Model(U, V):
#     # test the precision
#     k_num = len(top_k)
#     # k_num-length list to record F1 and NDCG
#     F1 = np.zeros(k_num)
#     NDCG = np.zeros(k_num)
#
#     # test all test samples
#     user_num = 0
#     for u in range(M):
#         # the data in test set is [[i, i, i, i],...,[i, i]]
#         test_item = Test[u]
#         if len(test_item) > 0:
#             user_num += 1
#             # score all items
#             score = np.dot(V, U[u])
#             # order
#             b = zip(score, range(N))
#             b.sort(key=lambda x: x[0])
#             order = [x[1] for x in b]
#             order.reverse()
#             # remove the training samples from the recommendations
#             train_positive = train_data_aux[u]
#             for item in train_positive:
#                 order.remove(item)
#             # for each k, calculate top_k
#             for i in range(len(top_k)):
#                 F1[i] += evaluation_F1(order, top_k[i], test_item)
#                 NDCG[i] += evaluation_NDCG(order, top_k[i], test_item)
#     # calculate the average
#     F1 = (F1 / user_num).tolist()
#     NDCG = (NDCG / user_num).tolist()
#
#     return F1, NDCG
#
# def train_Model(eta):
#     # training the model
#     # initialization
#     U = np.array([np.array([(random.random() / math.sqrt(K0)) for j in range(K0)]) for i in range(M)])
#     V = np.array([np.array([(random.random() / math.sqrt(K0)) for j in range(K0)]) for i in range(N)])
#     P = 5 * np.array([np.array([(random.random() / math.sqrt(K1)) for j in range(K1)]) for i in range(M)])
#     Q = 5 * np.array([np.array([(-random.random() / math.sqrt(K1)) for j in range(K1)]) for i in range(N)])
#     e = 10 ** 10
#     # output a result without training
#     print 'iteration ', 0,
#     [F1, NDCG] = test_Model(U, V)
#     Fmax = 0
#     if F1[0] > Fmax:
#         Fmax = F1[0]
#     print Fmax, 'F1: ', F1, '  ', 'NDCG: ', NDCG
#     # save in .xls file
#     F1_df = pd.DataFrame(columns=top_k)
#     NDCG_df = pd.DataFrame(columns=top_k)
#     F1_df.loc[0] = F1
#     NDCG_df.loc[0] = NDCG
#     save_df([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)  # @x
#     # get the numer of training samples
#     Re = len(train_data)
#     # split the training samples with batch_size
#     bs = range(0, Re, batch_size)
#     bs.append(Re)
#     # begin iterating
#     for ep in range(epoch):
#         print 'iteration ', ep + 1,
#         eta = eta * 0.99
#         # for each iterating, we user all training samples to train
#         for ii in range(len(bs) - 1):
#             if abs(U.sum()) < e:
#                 # input the samples as batches
#                 # initialize dU and dC to record the gradient
#                 dU = np.zeros((M, K0))
#                 dV = np.zeros((N, K0))
#                 dP = np.zeros((M, K1))
#                 dQ = np.zeros((N, K1))
#                 for re in range(bs[ii], bs[ii + 1]):
#                     # iterate each batch
#                     # train data, [u, i]
#                     u = train_data[re][0]
#                     i = train_data[re][1]
#                     Ri = d(np.dot(U[u], V[i]))
#                     gammai = d(np.dot(P[u], Q[i]))
#                     # select negative samples randomly
#                     num = 0
#                     while num < sample_rate:
#                         j = int(random.uniform(0, N))
#                         # check if the current sample is positive sample
#                         if not (j in train_data_aux[u]):
#                             num += 1
#                             Rj = d(np.dot(U[u], V[j]))
#                             gammaj = d(np.dot(P[u], Q[j]))
#                             D = (1 - Ri) * (1 - gammai)
#                             dU[u] += D * V[i]
#                             dV[i] += D * U[u]
#                             D = -Ri * gammai
#                             dP[u] += D * Q[i]
#                             dQ[i] += D * P[u]
#
#                             D = gammaj * (1 - Rj) - Rj
#                             dU[u] += D * V[j]
#                             dV[j] += D * U[u]
#                             D = (1 - gammaj) * Rj
#                             dP[u] += D * Q[j]
#                             dQ[j] += D * P[u]
#
#                 # update the matrices
#                 U += eta * (dU - lambda_r * U)
#                 V += eta * (dV - lambda_r * V)
#                 P += eta * (dP - lambda_phi * P)
#                 Q += eta * (dQ - lambda_phi * Q)
#         if abs(U.sum()) < e:
#             [F1, NDCG] = test_Model(U, V)
#             if F1[0] > Fmax:
#                 Fmax = F1[0]
#             F1_df.loc[ep + 1] = F1
#             NDCG_df.loc[ep + 1] = NDCG
#             print 'F1: ', F1, '  ', 'NDCG: ', NDCG
#             save_df([[F1_df, 'F1'], [NDCG_df, 'NDCG']], path_excel, first_sheet=False)  # @x
#         else:
#             break
#     return F1_df, NDCG_df
#
# def print_parameter():
#     # print all parameters
#     print 'model:', Model
#     print 'dataset:', dataset_list[dataset]
#     print 'eta:', eta
#     print 'lambda_r:', lambda_r, 'lambda_phi:', lambda_phi
#     print 'K0:', K0, 'K1:', K1
#     print 'vali_test:', ['validation', 'test'][vali_test]
#     print 'sample_rate:', sample_rate
#     print 'batch_size:', batch_size
#     print 'epoch:', epoch
#     print 'top_k:', top_k
#     print
#
# '''**************************main_function***************************'''
# '''**************************main_function***************************'''
#
# # load the data
# [train_data, train_data_aux, M, N] = readdata(path_train)
# validation_data = readdata(path_validation)[1]
# test_data = readdata(path_test)[1]
# # choose validation or test set
# if vali_test == 0:
#     Test = validation_data
# else:
#     Test = test_data
#
# data = [
#     ["Model", [Model]],
#     ["dataset", [dataset_list[dataset]]],
#     ["eta", [eta]],
#     ["lambda_r", [lambda_r]],
#     ["lambda_phi", [lambda_phi]],
#     ["K0", [K0]],
#     ["K1", [K1]],
#     ["vali_test", [['validation', 'test'][vali_test]]],
#     ["sample_rate", [sample_rate]],
#     ["batch_size", [batch_size]],
#     ['epoch', [epoch]],
#     ["top_k", top_k]
# ]
# path_excel = 'experiment_result\\' + dataset_list[dataset] + '_' + Model + '_' + str(int(time.time())) + str(int(random.uniform(100, 900))) + '.xlsx'
# print_parameter()
# save_parameters(data, path_excel)
# F1_df, NDCG_df = train_Model(eta)