From 0167ac82f12471da6898e9128d430b7af4be4d5a Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Thu, 8 Jul 2021 03:14:31 +0000 Subject: [PATCH 01/12] FEA: add config 'augmentation' --- recbole/config/configurator.py | 11 ++++++++++- recbole/data/dataset/sequential_dataset.py | 9 ++++++--- recbole/properties/dataset/sample.yaml | 1 + recbole/properties/quick_start_config/sequential.yaml | 2 +- .../properties/quick_start_config/sequential_DIN.yaml | 3 ++- recbole/utils/argument_list.py | 1 + 6 files changed, 21 insertions(+), 6 deletions(-) diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py index f2e15b21d..70b84b0b2 100644 --- a/recbole/config/configurator.py +++ b/recbole/config/configurator.py @@ -3,7 +3,7 @@ # @Email : linzihan.super@foxmail.com # UPDATE -# @Time : 2020/10/04, 2021/3/2, 2021/2/17, 2021/6/30 +# @Time : 2020/10/04, 2021/7/8, 2021/2/17, 2021/6/30 # @Author : Shanlei Mu, Yupeng Hou, Jiawei Guan, Xingyu Pan # @Email : slmu@ruc.edu.cn, houyupeng@ruc.edu.cn, Guanjw@ruc.edu.cn, xy_pan@foxmail.com @@ -284,6 +284,15 @@ def _set_default_parameters(self): else: raise ValueError('Either Model has attr \'input_type\',' 'or arg \'loss_type\' should exist in config.') + if self.final_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL and \ + self.final_config_dict['benchmark_filename'] is not None and \ + self.final_config_dict['augmentation']: + raise ValueError( + f'Benchmark datasets for sequential model {self.model} ' + f'should be augmented in advance, which should not be augmented again and ' + f'config \'augmentation\' should be False.' + ) + eval_type = None for metric in self.final_config_dict['metrics']: if metric.lower() in individual_metrics: diff --git a/recbole/data/dataset/sequential_dataset.py b/recbole/data/dataset/sequential_dataset.py index 706e7d6f6..f56901612 100644 --- a/recbole/data/dataset/sequential_dataset.py +++ b/recbole/data/dataset/sequential_dataset.py @@ -3,9 +3,9 @@ # @Email : chenyushuo@ruc.edu.cn # UPDATE: -# @Time : 2020/9/16, 2021/7/1 -# @Author : Yushuo Chen, Xingyu Pan -# @Email : chenyushuo@ruc.edu.cn, xy_pan@foxmail.com +# @Time : 2020/9/16, 2021/7/1, 2021/7/8 +# @Author : Yushuo Chen, Xingyu Pan, Yupeng Hou +# @Email : chenyushuo@ruc.edu.cn, xy_pan@foxmail.com, houyupeng@ruc.edu.cn """ recbole.data.sequential_dataset @@ -25,6 +25,8 @@ class SequentialDataset(Dataset): which can accelerate the data loader. Attributes: + augmentation (bool): Whether the interactions should be augmented in RecBole. + uid_list (numpy.ndarray): List of user id after augmentation. item_list_index (numpy.ndarray): List of indexes of item sequence after augmentation. @@ -36,6 +38,7 @@ class SequentialDataset(Dataset): """ def __init__(self, config): + self.augmentation = config['augmentation'] super().__init__(config) def prepare_data_augmentation(self): diff --git a/recbole/properties/dataset/sample.yaml b/recbole/properties/dataset/sample.yaml index d9869e5e2..ff4ac4915 100644 --- a/recbole/properties/dataset/sample.yaml +++ b/recbole/properties/dataset/sample.yaml @@ -43,6 +43,7 @@ normalize_field: ~ normalize_all: ~ # Sequential Model Needed +augmentation: False ITEM_LIST_LENGTH_FIELD: item_length LIST_SUFFIX: _list MAX_ITEM_LIST_LENGTH: 50 diff --git a/recbole/properties/quick_start_config/sequential.yaml b/recbole/properties/quick_start_config/sequential.yaml index 04e591aa2..2370f18b4 100644 --- a/recbole/properties/quick_start_config/sequential.yaml +++ b/recbole/properties/quick_start_config/sequential.yaml @@ -2,4 +2,4 @@ eval_args: split: {'LS': 2} order: TO mode: full - +augmentation: True diff --git a/recbole/properties/quick_start_config/sequential_DIN.yaml b/recbole/properties/quick_start_config/sequential_DIN.yaml index 398e2ff43..3e7f4a681 100644 --- a/recbole/properties/quick_start_config/sequential_DIN.yaml +++ b/recbole/properties/quick_start_config/sequential_DIN.yaml @@ -3,4 +3,5 @@ eval_args: order: TO mode: uni100 metrics: ['AUC', 'LogLoss'] -valid_metric: AUC \ No newline at end of file +valid_metric: AUC +augmentation: True \ No newline at end of file diff --git a/recbole/utils/argument_list.py b/recbole/utils/argument_list.py index 632f442bd..31b719989 100644 --- a/recbole/utils/argument_list.py +++ b/recbole/utils/argument_list.py @@ -45,6 +45,7 @@ 'ITEM_LIST_LENGTH_FIELD', 'LIST_SUFFIX', 'MAX_ITEM_LIST_LENGTH', 'POSITION_FIELD', 'HEAD_ENTITY_ID_FIELD', 'TAIL_ENTITY_ID_FIELD', 'RELATION_ID_FIELD', 'ENTITY_ID_FIELD', 'load_col', 'unload_col', 'unused_col', 'additional_feat_suffix', + 'augmentation', 'max_user_inter_num', 'min_user_inter_num', 'max_item_inter_num', 'min_item_inter_num', 'lowest_val', 'highest_val', 'equal_val', 'not_equal_val', 'fields_in_same_space', From 4fc613bc7af01c1110cded5e4153339783110f31 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Thu, 8 Jul 2021 13:11:31 +0000 Subject: [PATCH 02/12] REFACTOR: real augmentation for seq rec --- .../data/dataloader/sequential_dataloader.py | 118 ++------------ recbole/data/dataset/sequential_dataset.py | 147 ++++++++++-------- 2 files changed, 92 insertions(+), 173 deletions(-) diff --git a/recbole/data/dataloader/sequential_dataloader.py b/recbole/data/dataloader/sequential_dataloader.py index ad962a329..258a1814f 100644 --- a/recbole/data/dataloader/sequential_dataloader.py +++ b/recbole/data/dataloader/sequential_dataloader.py @@ -3,7 +3,7 @@ # @Email : houyupeng@ruc.edu.cn # UPDATE -# @Time : 2020/10/6, 2020/9/17 +# @Time : 2021/7/8, 2020/9/17 # @Author : Yupeng Hou, Yushuo Chen # @email : houyupeng@ruc.edu.cn, chenyushuo@ruc.edu.cn @@ -15,15 +15,15 @@ import numpy as np import torch -from recbole.data.dataloader.abstract_dataloader import AbstractDataLoader +from recbole.data.dataloader.general_dataloader import GeneralDataLoader from recbole.data.dataloader.neg_sample_mixin import NegSampleByMixin, NegSampleMixin from recbole.data.interaction import Interaction, cat_interactions -from recbole.utils import DataLoaderType, FeatureSource, FeatureType, InputType +from recbole.utils import DataLoaderType, InputType -class SequentialDataLoader(AbstractDataLoader): - """:class:`SequentialDataLoader` is used for sequential model. It will do data augmentation for the origin data. - And its returned data contains the following: +class SequentialDataLoader(GeneralDataLoader): + """:class:`SequentialDataLoader` is used for sequential model. + It contains the following: - user id - history items list @@ -41,109 +41,7 @@ class SequentialDataLoader(AbstractDataLoader): :obj:`~recbole.utils.enum_type.InputType.POINTWISE`. shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaults to ``False``. """ - dl_type = DataLoaderType.ORIGIN - - def __init__(self, config, dataset, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False): - self.uid_field = dataset.uid_field - self.iid_field = dataset.iid_field - self.time_field = dataset.time_field - self.max_item_list_len = config['MAX_ITEM_LIST_LENGTH'] - - list_suffix = config['LIST_SUFFIX'] - for field in dataset.inter_feat: - if field != self.uid_field: - list_field = field + list_suffix - setattr(self, f'{field}_list_field', list_field) - ftype = dataset.field2type[field] - - if ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ]: - list_ftype = FeatureType.TOKEN_SEQ - else: - list_ftype = FeatureType.FLOAT_SEQ - - if ftype in [FeatureType.TOKEN_SEQ, FeatureType.FLOAT_SEQ]: - list_len = (self.max_item_list_len, dataset.field2seqlen[field]) - else: - list_len = self.max_item_list_len - - dataset.set_field_property(list_field, list_ftype, FeatureSource.INTERACTION, list_len) - - self.item_list_length_field = config['ITEM_LIST_LENGTH_FIELD'] - dataset.set_field_property(self.item_list_length_field, FeatureType.TOKEN, FeatureSource.INTERACTION, 1) - - self.uid_list = dataset.uid_list - self.item_list_index = dataset.item_list_index - self.target_index = dataset.target_index - self.item_list_length = dataset.item_list_length - self.pre_processed_data = None - - super().__init__(config, dataset, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle) - - def data_preprocess(self): - """Do data augmentation before training/evaluation. - """ - self.pre_processed_data = self.augmentation(self.item_list_index, self.target_index, self.item_list_length) - - @property - def pr_end(self): - return len(self.uid_list) - - def _shuffle(self): - if self.real_time: - new_index = torch.randperm(self.pr_end) - self.uid_list = self.uid_list[new_index] - self.item_list_index = self.item_list_index[new_index] - self.target_index = self.target_index[new_index] - self.item_list_length = self.item_list_length[new_index] - else: - self.pre_processed_data.shuffle() - - def _next_batch_data(self): - cur_data = self._get_processed_data(slice(self.pr, self.pr + self.step)) - self.pr += self.step - return cur_data - - def _get_processed_data(self, index): - if self.real_time: - cur_data = self.augmentation( - self.item_list_index[index], self.target_index[index], self.item_list_length[index] - ) - else: - cur_data = self.pre_processed_data[index] - return cur_data - - def augmentation(self, item_list_index, target_index, item_list_length): - """Data augmentation. - - Args: - item_list_index (numpy.ndarray): the index of history items list in interaction. - target_index (numpy.ndarray): the index of items to be predicted in interaction. - item_list_length (numpy.ndarray): history list length. - - Returns: - dict: the augmented data. - """ - new_length = len(item_list_index) - new_data = self.dataset.inter_feat[target_index] - new_dict = { - self.item_list_length_field: torch.tensor(item_list_length), - } - - for field in self.dataset.inter_feat: - if field != self.uid_field: - list_field = getattr(self, f'{field}_list_field') - list_len = self.dataset.field2seqlen[list_field] - shape = (new_length, list_len) if isinstance(list_len, int) else (new_length,) + list_len - list_ftype = self.dataset.field2type[list_field] - dtype = torch.int64 if list_ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ] else torch.float64 - new_dict[list_field] = torch.zeros(shape, dtype=dtype) - - value = self.dataset.inter_feat[field] - for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): - new_dict[list_field][i][:length] = value[index] - - new_data.update(Interaction(new_dict)) - return new_data + pass class SequentialNegSampleDataLoader(NegSampleByMixin, SequentialDataLoader): @@ -167,6 +65,7 @@ class SequentialNegSampleDataLoader(NegSampleByMixin, SequentialDataLoader): def __init__( self, config, dataset, sampler, neg_sample_args, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False ): + self.iid_field = dataset.iid_field super().__init__( config, dataset, sampler, neg_sample_args, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle ) @@ -253,6 +152,7 @@ class SequentialFullDataLoader(NegSampleMixin, SequentialDataLoader): def __init__( self, config, dataset, sampler, neg_sample_args, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False ): + self.iid_field = dataset.iid_field super().__init__( config, dataset, sampler, neg_sample_args, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle ) diff --git a/recbole/data/dataset/sequential_dataset.py b/recbole/data/dataset/sequential_dataset.py index f56901612..5a66ea919 100644 --- a/recbole/data/dataset/sequential_dataset.py +++ b/recbole/data/dataset/sequential_dataset.py @@ -12,11 +12,12 @@ ############################### """ -import copy - import numpy as np +import torch from recbole.data.dataset import Dataset +from recbole.data.interaction import Interaction +from recbole.utils.enum_type import FeatureType, FeatureSource class SequentialDataset(Dataset): @@ -39,9 +40,44 @@ class SequentialDataset(Dataset): def __init__(self, config): self.augmentation = config['augmentation'] + self.max_item_list_len = config['MAX_ITEM_LIST_LENGTH'] + self.item_list_length_field = config['ITEM_LIST_LENGTH_FIELD'] super().__init__(config) - def prepare_data_augmentation(self): + def _change_feat_format(self): + """Change feat format from :class:`pandas.DataFrame` to :class:`Interaction`, + then perform data augmentation. + """ + super()._change_feat_format() + + if not self.augmentation: + return + self.logger.debug('Augmentation for sequential recommendation.') + self.data_augmentation() + + def _aug_presets(self): + list_suffix = self.config['LIST_SUFFIX'] + for field in self.inter_feat: + if field != self.uid_field: + list_field = field + list_suffix + setattr(self, f'{field}_list_field', list_field) + ftype = self.field2type[field] + + if ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ]: + list_ftype = FeatureType.TOKEN_SEQ + else: + list_ftype = FeatureType.FLOAT_SEQ + + if ftype in [FeatureType.TOKEN_SEQ, FeatureType.FLOAT_SEQ]: + list_len = (self.max_item_list_len, self.field2seqlen[field]) + else: + list_len = self.max_item_list_len + + self.set_field_property(list_field, list_ftype, FeatureSource.INTERACTION, list_len) + + self.set_field_property(self.item_list_length_field, FeatureType.TOKEN, FeatureSource.INTERACTION, 1) + + def data_augmentation(self): """Augmentation processing for sequential dataset. E.g., ``u1`` has purchase sequence ````, @@ -57,14 +93,10 @@ def prepare_data_augmentation(self): ``u1, | i3`` ``u1, | i4`` - - Note: - Actually, we do not really generate these new item sequences. - One user's item sequence is stored only once in memory. - We store the index (slice) of each item sequence after augmentation, - which saves memory and accelerates a lot. """ - self.logger.debug('prepare_data_augmentation') + self.logger.debug('data_augmentation') + + self._aug_presets() self._check_field('uid_field', 'time_field') max_item_list_len = self.config['MAX_ITEM_LIST_LENGTH'] @@ -84,34 +116,32 @@ def prepare_data_augmentation(self): target_index.append(i) item_list_length.append(i - seq_start) - self.uid_list = np.array(uid_list) - self.item_list_index = np.array(item_list_index) - self.target_index = np.array(target_index) - self.item_list_length = np.array(item_list_length, dtype=np.int64) - self.mask = np.ones(len(self.inter_feat), dtype=np.bool) - - def leave_one_out(self, group_by, leave_one_num=1): - self.logger.debug(f'Leave one out, group_by=[{group_by}], leave_one_num=[{leave_one_num}].') - if group_by is None: - raise ValueError('Leave one out strategy require a group field.') - if group_by != self.uid_field: - raise ValueError('Sequential models require group by user.') - - self.prepare_data_augmentation() - grouped_index = self._grouped_index(self.uid_list) - next_index = self._split_index_by_leave_one_out(grouped_index, leave_one_num) - - self._drop_unused_col() - next_ds = [] - for index in next_index: - ds = copy.copy(self) - for field in ['uid_list', 'item_list_index', 'target_index', 'item_list_length']: - setattr(ds, field, np.array(getattr(ds, field)[index])) - setattr(ds, 'mask', np.ones(len(self.inter_feat), dtype=np.bool)) - next_ds.append(ds) - next_ds[0].mask[self.target_index[next_index[1] + next_index[2]]] = False - next_ds[1].mask[self.target_index[next_index[2]]] = False - return next_ds + uid_list = np.array(uid_list) + item_list_index = np.array(item_list_index) + target_index = np.array(target_index) + item_list_length = np.array(item_list_length, dtype=np.int64) + + new_length = len(item_list_index) + new_data = self.inter_feat[target_index] + new_dict = { + self.item_list_length_field: torch.tensor(item_list_length), + } + + for field in self.inter_feat: + if field != self.uid_field: + list_field = getattr(self, f'{field}_list_field') + list_len = self.field2seqlen[list_field] + shape = (new_length, list_len) if isinstance(list_len, int) else (new_length,) + list_len + list_ftype = self.field2type[list_field] + dtype = torch.int64 if list_ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ] else torch.float64 + new_dict[list_field] = torch.zeros(shape, dtype=dtype) + + value = self.inter_feat[field] + for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): + new_dict[list_field][i][:length] = value[index] + + new_data.update(Interaction(new_dict)) + self.inter_feat = new_data def inter_matrix(self, form='coo', value_field=None): """Get sparse matrix that describe interactions between user_id and item_id. @@ -129,35 +159,24 @@ def inter_matrix(self, form='coo', value_field=None): """ if not self.uid_field or not self.iid_field: raise ValueError('dataset does not exist uid/iid, thus can not converted to sparse matrix.') - - self.logger.warning( - 'Load interaction matrix may lead to label leakage from testing phase, this implementation ' - 'only provides the interactions corresponding to specific phase' - ) - local_inter_feat = self.inter_feat[self.mask] # TODO: self.mask will applied to _history_matrix() in future + local_inter_feat = self.inter_feat + # TODO add items in the session of length 1 + raise NotImplementedError() return self._create_sparse_matrix(local_inter_feat, self.uid_field, self.iid_field, form, value_field) def build(self): + """Processing dataset according to evaluation setting, including Group, Order and Split. + See :class:`~recbole.config.eval_setting.EvalSetting` for details. - self._change_feat_format() + Args: + eval_setting (:class:`~recbole.config.eval_setting.EvalSetting`): + Object contains evaluation settings, which guide the data processing procedure. - + Returns: + list: List of built :class:`Dataset`. + """ ordering_args = self.config['eval_args']['order'] - if ordering_args == 'RO': - raise ValueError('Ordering strategy `shuffle` is not supported in sequential models.') - - group_by = self.config['eval_args']['group_by'] - if group_by != 'user': - raise ValueError('The data splitting for Sequential models must be grouped by user.') - - split_args = self.config['eval_args']['split'] - if split_args is None: - raise ValueError('The split_args in eval_args should not be None.') - if isinstance(split_args, dict) != True: - raise ValueError(f'The split_args [{split_args}] should be a dict.') - - split_mode = list(split_args.keys())[0] - if split_mode == 'LS': - return self.leave_one_out(group_by=self.config['USER_ID_FIELD'], leave_one_num=split_args['LS']) - else: - ValueError('Sequential models require `LS` (leave one out) split strategy.') + if ordering_args != 'TO': + raise ValueError(f'The ordering args for sequential recommendation has to be \'TO\'') + + return super().build() From 563043ca4b1e154c33566487300ea857ae4c4340 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 02:17:16 +0000 Subject: [PATCH 03/12] FIX: bugs in negative sample seq dataloader --- recbole/data/dataloader/sequential_dataloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/recbole/data/dataloader/sequential_dataloader.py b/recbole/data/dataloader/sequential_dataloader.py index 258a1814f..ab77fd9c7 100644 --- a/recbole/data/dataloader/sequential_dataloader.py +++ b/recbole/data/dataloader/sequential_dataloader.py @@ -65,7 +65,9 @@ class SequentialNegSampleDataLoader(NegSampleByMixin, SequentialDataLoader): def __init__( self, config, dataset, sampler, neg_sample_args, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False ): + self.uid_field = dataset.uid_field self.iid_field = dataset.iid_field + self.label_field = dataset.label_field super().__init__( config, dataset, sampler, neg_sample_args, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle ) @@ -77,7 +79,7 @@ def _batch_size_adaptation(self): self.upgrade_batch_size(new_batch_size) def _next_batch_data(self): - cur_data = self._get_processed_data(slice(self.pr, self.pr + self.step)) + cur_data = self.dataset[self.pr:self.pr + self.step] cur_data = self._neg_sampling(cur_data) self.pr += self.step From 09e967a6d5f3014a4ce1019645a6390914088163 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 02:17:39 +0000 Subject: [PATCH 04/12] FIX: comments for seq dataset --- recbole/data/dataset/sequential_dataset.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/recbole/data/dataset/sequential_dataset.py b/recbole/data/dataset/sequential_dataset.py index 5a66ea919..4d3e6a7c2 100644 --- a/recbole/data/dataset/sequential_dataset.py +++ b/recbole/data/dataset/sequential_dataset.py @@ -3,7 +3,7 @@ # @Email : chenyushuo@ruc.edu.cn # UPDATE: -# @Time : 2020/9/16, 2021/7/1, 2021/7/8 +# @Time : 2020/9/16, 2021/7/1, 2021/7/9 # @Author : Yushuo Chen, Xingyu Pan, Yupeng Hou # @Email : chenyushuo@ruc.edu.cn, xy_pan@foxmail.com, houyupeng@ruc.edu.cn @@ -27,15 +27,8 @@ class SequentialDataset(Dataset): Attributes: augmentation (bool): Whether the interactions should be augmented in RecBole. - - uid_list (numpy.ndarray): List of user id after augmentation. - - item_list_index (numpy.ndarray): List of indexes of item sequence after augmentation. - - target_index (numpy.ndarray): List of indexes of target item id after augmentation. - - item_list_length (numpy.ndarray): List of item sequences' length after augmentation. - + max_item_list_len (int): Max length of historical item list. + item_list_length_field (str): Field name for item lists' length. """ def __init__(self, config): From d097d4e85a06a16c08a1f29753689037061dcd65 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 02:49:14 +0000 Subject: [PATCH 05/12] REFACTOR: DIEN & DIN's dataset & dataloader --- recbole/data/dataloader/__init__.py | 1 - recbole/data/dataloader/dien_dataloader.py | 146 --------------------- recbole/data/dataset/customized_dataset.py | 110 +++++++++++++++- recbole/data/utils.py | 44 +------ 4 files changed, 110 insertions(+), 191 deletions(-) delete mode 100644 recbole/data/dataloader/dien_dataloader.py diff --git a/recbole/data/dataloader/__init__.py b/recbole/data/dataloader/__init__.py index dce69e125..b8b2e911f 100644 --- a/recbole/data/dataloader/__init__.py +++ b/recbole/data/dataloader/__init__.py @@ -3,7 +3,6 @@ from recbole.data.dataloader.general_dataloader import * from recbole.data.dataloader.context_dataloader import * from recbole.data.dataloader.sequential_dataloader import * -from recbole.data.dataloader.dien_dataloader import * from recbole.data.dataloader.knowledge_dataloader import * from recbole.data.dataloader.decisiontree_dataloader import * from recbole.data.dataloader.user_dataloader import * diff --git a/recbole/data/dataloader/dien_dataloader.py b/recbole/data/dataloader/dien_dataloader.py deleted file mode 100644 index 6fb06b321..000000000 --- a/recbole/data/dataloader/dien_dataloader.py +++ /dev/null @@ -1,146 +0,0 @@ -# @Time : 2021/2/25 -# @Author : Zhichao Feng -# @Email : fzcbupt@gmail.com - -# UPDATE -# @Time : 2021/3/19 -# @Author : Zhichao Feng -# @email : fzcbupt@gmail.com - -""" -recbole.data.dataloader.dien_dataloader -################################################ -""" - -import torch - -from recbole.data.dataloader.sequential_dataloader import SequentialDataLoader, SequentialNegSampleDataLoader, SequentialFullDataLoader -from recbole.data.interaction import Interaction, cat_interactions -from recbole.utils import DataLoaderType, FeatureSource, FeatureType, InputType -from recbole.sampler import SeqSampler - - -class DIENDataLoader(SequentialDataLoader): - """:class:`DIENDataLoader` is used for DIEN model. It is different from :class:`SequentialDataLoader` in - `augmentation`. It add users' negative item list to interaction. - It will do data augmentation for the origin data. And its returned data contains the following: - - - user id - - history items list - - history negative item list - - history items' interaction time list - - item to be predicted - - the interaction time of item to be predicted - - history list length - - other interaction information of item to be predicted - - Args: - config (Config): The config of dataloader. - dataset (Dataset): The dataset of dataloader. - batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``. - dl_format (InputType, optional): The input type of dataloader. Defaults to - :obj:`~recbole.utils.enum_type.InputType.POINTWISE`. - shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaults to ``False``. - """ - dl_type = DataLoaderType.ORIGIN - - def __init__(self, config, dataset, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False): - - list_suffix = config['LIST_SUFFIX'] - neg_prefix = config['NEG_PREFIX'] - - self.seq_sampler = SeqSampler(dataset) - self.iid_field = dataset.iid_field - self.neg_item_list_field = neg_prefix + self.iid_field + list_suffix - self.neg_item_list = self.seq_sampler.sample_neg_sequence(dataset.inter_feat[self.iid_field]) - - super().__init__(config, dataset, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle) - - def augmentation(self, item_list_index, target_index, item_list_length): - """Data augmentation. - - Args: - item_list_index (numpy.ndarray): the index of history items list in interaction. - target_index (numpy.ndarray): the index of items to be predicted in interaction. - item_list_length (numpy.ndarray): history list length. - - Returns: - dict: the augmented data. - """ - new_length = len(item_list_index) - new_data = self.dataset.inter_feat[target_index] - new_dict = { - self.item_list_length_field: torch.tensor(item_list_length), - } - - for field in self.dataset.inter_feat: - if field != self.uid_field: - list_field = getattr(self, f'{field}_list_field') - list_len = self.dataset.field2seqlen[list_field] - shape = (new_length, list_len) if isinstance(list_len, int) else (new_length,) + list_len - list_ftype = self.dataset.field2type[list_field] - dtype = torch.int64 if list_ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ] else torch.float64 - new_dict[list_field] = torch.zeros(shape, dtype=dtype) - - value = self.dataset.inter_feat[field] - for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): - new_dict[list_field][i][:length] = value[index] - - if field == self.iid_field: - new_dict[self.neg_item_list_field] = torch.zeros(shape, dtype=dtype) - for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): - new_dict[self.neg_item_list_field][i][:length] = self.neg_item_list[index] - - new_data.update(Interaction(new_dict)) - return new_data - - -class DIENNegSampleDataLoader(SequentialNegSampleDataLoader, DIENDataLoader): - """:class:`DIENNegSampleDataLoader` is sequential-dataloader with negative sampling for DIEN. - Like :class:`~recbole.data.dataloader.general_dataloader.GeneralNegSampleDataLoader`, for the result of every batch, - we permit that every positive interaction and its negative interaction must be in the same batch. Beside this, - when it is in the evaluation stage, and evaluator is topk-like function, we also permit that all the interactions - corresponding to each user are in the same batch and positive interactions are before negative interactions. - - Args: - config (Config): The config of dataloader. - dataset (Dataset): The dataset of dataloader. - sampler (Sampler): The sampler of dataloader. - neg_sample_args (dict): The neg_sample_args of dataloader. - batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``. - dl_format (InputType, optional): The input type of dataloader. Defaults to - :obj:`~recbole.utils.enum_type.InputType.POINTWISE`. - shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaults to ``False``. - """ - - def __init__( - self, config, dataset, sampler, neg_sample_args, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False - ): - super().__init__( - config, dataset, sampler, neg_sample_args, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle - ) - - -class DIENFullDataLoader(SequentialFullDataLoader, DIENDataLoader): - """:class:`DIENFullDataLoader` is a sequential-dataloader with full sort for DIEN. In order to speed up calculation, - this dataloader would only return then user part of interactions, positive items and used items. - It would not return negative items. - - Args: - config (Config): The config of dataloader. - dataset (Dataset): The dataset of dataloader. - sampler (Sampler): The sampler of dataloader. - neg_sample_args (dict): The neg_sample_args of dataloader. - batch_size (int, optional): The batch_size of dataloader. Defaults to ``1``. - dl_format (InputType, optional): The input type of dataloader. Defaults to - :obj:`~recbole.utils.enum_type.InputType.POINTWISE`. - shuffle (bool, optional): Whether the dataloader will be shuffle after a round. Defaults to ``False``. - """ - dl_type = DataLoaderType.FULL - - def __init__( - self, config, dataset, sampler, neg_sample_args, batch_size=1, dl_format=InputType.POINTWISE, shuffle=False - ): - super().__init__( - config, dataset, sampler, neg_sample_args, batch_size=batch_size, dl_format=dl_format, shuffle=shuffle - ) diff --git a/recbole/data/dataset/customized_dataset.py b/recbole/data/dataset/customized_dataset.py index d1e34bab2..3dd7f0546 100644 --- a/recbole/data/dataset/customized_dataset.py +++ b/recbole/data/dataset/customized_dataset.py @@ -2,6 +2,11 @@ # @Author : Yupeng Hou # @Email : houyupeng@ruc.edu.cn +# UPDATE +# @Time : 2021/7/9 +# @Author : Yupeng Hou +# @Email : houyupeng@ruc.edu.cn + """ recbole.data.customized_dataset ################################## @@ -11,7 +16,13 @@ Customized datasets named ``[Model Name]Dataset`` can be automatically called. """ -from recbole.data.dataset import Kg_Seq_Dataset +import numpy as np +import torch + +from recbole.data.dataset import Kg_Seq_Dataset, SequentialDataset +from recbole.data.interaction import Interaction +from recbole.sampler import SeqSampler +from recbole.utils.enum_type import FeatureType class GRU4RecKGDataset(Kg_Seq_Dataset): @@ -24,3 +35,100 @@ class KSRDataset(Kg_Seq_Dataset): def __init__(self, config): super().__init__(config) + + +class DIENDataset(SequentialDataset): + """:class:`DIENDataset` is based on :class:`~recbole.data.dataset.sequential_dataset.SequentialDataset`. + It is different from :class:`SequentialDataset` in `data_augmentation`. + It add users' negative item list to interaction. + + The original version of sampling negative item list is implemented by Zhichao Feng (fzcbupt@gmail.com) in 2021/2/25, + and he updated the codes in 2021/3/19. In 2021/7/9, Yupeng refactored SequentialDataset & SequentialDataLoader, + then refactored DIENDataset, either. + + Attributes: + augmentation (bool): Whether the interactions should be augmented in RecBole. + seq_sample (recbole.sampler.SeqSampler): A sampler used to sample negative item sequence. + neg_item_list_field (str): Field name for negative item sequence. + neg_item_list (torch.tensor): all users' negative item history sequence. + """ + def __init__(self, config): + super().__init__(config) + + list_suffix = config['LIST_SUFFIX'] + neg_prefix = config['NEG_PREFIX'] + self.seq_sampler = SeqSampler(self) + self.neg_item_list_field = neg_prefix + self.iid_field + list_suffix + self.neg_item_list = self.seq_sampler.sample_neg_sequence(self.inter_feat[self.iid_field]) + + def data_augmentation(self): + """Augmentation processing for sequential dataset. + + E.g., ``u1`` has purchase sequence ````, + then after augmentation, we will generate three cases. + + ``u1, | i2`` + + (Which means given user_id ``u1`` and item_seq ````, + we need to predict the next item ``i2``.) + + The other cases are below: + + ``u1, | i3`` + + ``u1, | i4`` + """ + self.logger.debug('data_augmentation') + + self._aug_presets() + + self._check_field('uid_field', 'time_field') + max_item_list_len = self.config['MAX_ITEM_LIST_LENGTH'] + self.sort(by=[self.uid_field, self.time_field], ascending=True) + last_uid = None + uid_list, item_list_index, target_index, item_list_length = [], [], [], [] + seq_start = 0 + for i, uid in enumerate(self.inter_feat[self.uid_field].numpy()): + if last_uid != uid: + last_uid = uid + seq_start = i + else: + if i - seq_start > max_item_list_len: + seq_start += 1 + uid_list.append(uid) + item_list_index.append(slice(seq_start, i)) + target_index.append(i) + item_list_length.append(i - seq_start) + + uid_list = np.array(uid_list) + item_list_index = np.array(item_list_index) + target_index = np.array(target_index) + item_list_length = np.array(item_list_length, dtype=np.int64) + + new_length = len(item_list_index) + new_data = self.inter_feat[target_index] + new_dict = { + self.item_list_length_field: torch.tensor(item_list_length), + } + + for field in self.inter_feat: + if field != self.uid_field: + list_field = getattr(self, f'{field}_list_field') + list_len = self.field2seqlen[list_field] + shape = (new_length, list_len) if isinstance(list_len, int) else (new_length,) + list_len + list_ftype = self.field2type[list_field] + dtype = torch.int64 if list_ftype in [FeatureType.TOKEN, FeatureType.TOKEN_SEQ] else torch.float64 + new_dict[list_field] = torch.zeros(shape, dtype=dtype) + + value = self.inter_feat[field] + for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): + new_dict[list_field][i][:length] = value[index] + + # DIEN + if field == self.iid_field: + new_dict[self.neg_item_list_field] = torch.zeros(shape, dtype=dtype) + for i, (index, length) in enumerate(zip(item_list_index, item_list_length)): + new_dict[self.neg_item_list_field][i][:length] = self.neg_item_list[index] + + new_data.update(Interaction(new_dict)) + self.inter_feat = new_data diff --git a/recbole/data/utils.py b/recbole/data/utils.py index 56463a0b1..124a4a737 100644 --- a/recbole/data/utils.py +++ b/recbole/data/utils.py @@ -3,7 +3,7 @@ # @Email : houyupeng@ruc.edu.cn # UPDATE: -# @Time : 2020/10/19, 2020/9/17, 2020/8/31, 2021/2/20, 2021/3/1 +# @Time : 2021/7/9, 2020/9/17, 2020/8/31, 2021/2/20, 2021/3/1 # @Author : Yupeng Hou, Yushuo Chen, Kaiyuan Li, Haoran Cheng, Jiawei Guan # @Email : houyupeng@ruc.edu.cn, chenyushuo@ruc.edu.cn, tsotfsk@outlook.com, chenghaoran29@foxmail.com, guanjw@ruc.edu.cn @@ -225,8 +225,6 @@ def get_data_loader(name, config, neg_sample_args): type: The dataloader class that meets the requirements in :attr:`config` and :attr:`eval_setting`. """ register_table = { - 'DIN': _get_DIN_data_loader, - 'DIEN': _get_DIEN_data_loader, "MultiDAE": _get_AE_data_loader, "MultiVAE": _get_AE_data_loader, 'MacridVAE': _get_AE_data_loader, @@ -274,46 +272,6 @@ def get_data_loader(name, config, neg_sample_args): raise NotImplementedError(f'Model_type [{model_type}] has not been implemented.') -def _get_DIN_data_loader(name, config, neg_sample_args): - """Customized function for DIN to get correct dataloader class. - - Args: - name (str): The stage of dataloader. It can only take two values: 'train' or 'evaluation'. - config (Config): An instance object of Config, used to record parameter information. - neg_sample_args : Settings of negative sampling. - - Returns: - type: The dataloader class that meets the requirements in :attr:`config` and :attr:`eval_setting`. - """ - neg_sample_strategy = neg_sample_args['strategy'] - if neg_sample_strategy == 'none': - return SequentialDataLoader - elif neg_sample_strategy == 'by': - return SequentialNegSampleDataLoader - elif neg_sample_strategy == 'full': - return SequentialFullDataLoader - - -def _get_DIEN_data_loader(name, config, neg_sample_args): - """Customized function for DIEN to get correct dataloader class. - - Args: - name (str): The stage of dataloader. It can only take two values: 'train' or 'evaluation'. - config (Config): An instance object of Config, used to record parameter information. - neg_sample_args : Settings of negative sampling. - - Returns: - type: The dataloader class that meets the requirements in :attr:`config` and :attr:`eval_setting`. - """ - neg_sample_strategy = neg_sample_args['strategy'] - if neg_sample_strategy == 'none': - return DIENDataLoader - elif neg_sample_strategy == 'by': - return DIENNegSampleDataLoader - elif neg_sample_strategy == 'full': - return DIENFullDataLoader - - def _get_AE_data_loader(name, config, neg_sample_args): """Customized function for Multi-DAE and Multi-VAE to get correct dataloader class. From 9c2cbae2af5c1770091b28974d8cd57b9ea3f502 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 06:10:40 +0000 Subject: [PATCH 06/12] FEA: inter_matrix for seq dataset --- recbole/data/dataset/sequential_dataset.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/recbole/data/dataset/sequential_dataset.py b/recbole/data/dataset/sequential_dataset.py index 4d3e6a7c2..1f03dff3f 100644 --- a/recbole/data/dataset/sequential_dataset.py +++ b/recbole/data/dataset/sequential_dataset.py @@ -152,9 +152,19 @@ def inter_matrix(self, form='coo', value_field=None): """ if not self.uid_field or not self.iid_field: raise ValueError('dataset does not exist uid/iid, thus can not converted to sparse matrix.') - local_inter_feat = self.inter_feat - # TODO add items in the session of length 1 - raise NotImplementedError() + + l1_idx = (self.inter_feat[self.item_list_length_field] == 1) + l1_inter_dict = self.inter_feat[l1_idx].interaction + new_dict = {} + list_suffix = self.config['LIST_SUFFIX'] + candidate_field_set = set() + for field in l1_inter_dict: + if field != self.uid_field and field + list_suffix in l1_inter_dict: + candidate_field_set.add(field) + new_dict[field] = torch.cat([self.inter_feat[field], l1_inter_dict[field + list_suffix][:,0]]) + elif (not field.endswith(list_suffix)) and (field != self.item_list_length_field): + new_dict[field] = torch.cat([self.inter_feat[field], l1_inter_dict[field]]) + local_inter_feat = Interaction(new_dict) return self._create_sparse_matrix(local_inter_feat, self.uid_field, self.iid_field, form, value_field) def build(self): From 894fd33a4e6eec31b1a8852fb2d794194d7da52d Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 06:10:54 +0000 Subject: [PATCH 07/12] FIX: test for seq loo --- tests/data/test_dataset.py | 59 +++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index 2844c97e5..65512d45d 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -564,21 +564,34 @@ def test_seq_leave_one_out(self): 'training_neg_sample_num': 0 } train_dataset, valid_dataset, test_dataset = split_dataset(config_dict=config_dict) - assert (train_dataset.uid_list == [1, 1, 1, 1, 1, 2, 2, 3, 4]).all() - assert (train_dataset.item_list_index == [slice(0, 1), slice(0, 2), slice(0, 3), slice(0, 4), slice(0, 5), - slice(8, 9), slice(8, 10), slice(13, 14), slice(16, 17)]).all() - assert (train_dataset.target_index == [1, 2, 3, 4, 5, 9, 10, 14, 17]).all() - assert (train_dataset.item_list_length == [1, 2, 3, 4, 5, 1, 2, 1, 1]).all() - - assert (valid_dataset.uid_list == [1, 2]).all() - assert (valid_dataset.item_list_index == [slice(0, 6), slice(8, 11)]).all() - assert (valid_dataset.target_index == [6, 11]).all() - assert (valid_dataset.item_list_length == [6, 3]).all() - - assert (test_dataset.uid_list == [1, 2, 3]).all() - assert (test_dataset.item_list_index == [slice(0, 7), slice(8, 12), slice(13, 15)]).all() - assert (test_dataset.target_index == [7, 12, 15]).all() - assert (test_dataset.item_list_length == [7, 4, 2]).all() + assert (train_dataset.inter_feat[train_dataset.uid_field].numpy() == [1, 1, 1, 1, 1, 4, 2, 2, 3]).all() + assert (train_dataset.inter_feat[train_dataset.item_id_list_field][:,:5].numpy() == [ + [1, 0, 0, 0, 0], + [1, 2, 0, 0, 0], + [1, 2, 3, 0, 0], + [1, 2, 3, 4, 0], + [1, 2, 3, 4, 5], + [3, 0, 0, 0, 0], + [4, 0, 0, 0, 0], + [4, 5, 0, 0, 0], + [4, 0, 0, 0, 0]]).all() + assert (train_dataset.inter_feat[train_dataset.iid_field].numpy() == [2, 3, 4, 5, 6, 4, 5, 6, 5]).all() + assert (train_dataset.inter_feat[train_dataset.item_list_length_field].numpy() == [1, 2, 3, 4, 5, 1, 1, 2, 1]).all() + + assert (valid_dataset.inter_feat[valid_dataset.uid_field].numpy() == [1, 2]).all() + assert (valid_dataset.inter_feat[valid_dataset.item_id_list_field][:,:6].numpy() == [ + [1, 2, 3, 4, 5, 6], + [4, 5, 6, 0, 0, 0]]).all() + assert (valid_dataset.inter_feat[valid_dataset.iid_field].numpy() == [7, 7]).all() + assert (valid_dataset.inter_feat[valid_dataset.item_list_length_field].numpy() == [6, 3]).all() + + assert (test_dataset.inter_feat[test_dataset.uid_field].numpy() == [1, 2, 3]).all() + assert (test_dataset.inter_feat[test_dataset.item_id_list_field][:,:7].numpy() == [ + [1, 2, 3, 4, 5, 6, 7], + [4, 5, 6, 7, 0, 0, 0], + [4, 5, 0, 0, 0, 0, 0]]).all() + assert (test_dataset.inter_feat[test_dataset.iid_field].numpy() == [8, 8, 6]).all() + assert (test_dataset.inter_feat[test_dataset.item_list_length_field].numpy() == [7, 4, 2]).all() assert (train_dataset.inter_matrix().toarray() == [ [0., 0., 0., 0., 0., 0., 0., 0., 0.], @@ -589,17 +602,17 @@ def test_seq_leave_one_out(self): ]).all() assert (valid_dataset.inter_matrix().toarray() == [ [0., 0., 0., 0., 0., 0., 0., 0., 0.], - [0., 1., 1., 1., 1., 1., 1., 1., 0.], - [0., 0., 0., 0., 1., 1., 1., 1., 0.], - [0., 0., 0., 0., 1., 1., 0., 0., 0.], - [0., 0., 0., 1., 1., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 1., 0.], + [0., 0., 0., 0., 0., 0., 0., 1., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0.] ]).all() assert (test_dataset.inter_matrix().toarray() == [ [0., 0., 0., 0., 0., 0., 0., 0., 0.], - [0., 1., 1., 1., 1., 1., 1., 1., 1.], - [0., 0., 0., 0., 1., 1., 1., 1., 1.], - [0., 0., 0., 0., 1., 1., 1., 0., 0.], - [0., 0., 0., 1., 1., 0., 0., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 1.], + [0., 0., 0., 0., 0., 0., 0., 0., 1.], + [0., 0., 0., 0., 0., 0., 1., 0., 0.], + [0., 0., 0., 0., 0., 0., 0., 0., 0.] ]).all() From dc4fc149b0e092acf44a3f5cd3704332e43c7a74 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 06:15:36 +0000 Subject: [PATCH 08/12] FIX: update meta-data for test_dataset.py --- tests/data/test_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/data/test_dataset.py b/tests/data/test_dataset.py index 65512d45d..1ddaeb1c4 100644 --- a/tests/data/test_dataset.py +++ b/tests/data/test_dataset.py @@ -4,9 +4,9 @@ # @Email : chenyushuo@ruc.edu.cn # UPDATE -# @Time : 2020/1/3, 2021/7/1 -# @Author : Yushuo Chen, Xingyu Pan -# @email : chenyushuo@ruc.edu.cn, xy_pan@foxmail.com +# @Time : 2020/1/3, 2021/7/1, 2021/7/9 +# @Author : Yushuo Chen, Xingyu Pan, Yupeng Hou +# @email : chenyushuo@ruc.edu.cn, xy_pan@foxmail.com, houyupeng@ruc.edu.cn import logging import os From 27d5bfd12125b22b0634cca2c2029c6c1886f560 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Fri, 9 Jul 2021 18:18:18 +0000 Subject: [PATCH 09/12] FIX: remove arg 'augmentation' --- recbole/config/configurator.py | 9 --------- recbole/data/dataset/sequential_dataset.py | 6 ++---- recbole/properties/quick_start_config/sequential.yaml | 1 - .../properties/quick_start_config/sequential_DIN.yaml | 1 - 4 files changed, 2 insertions(+), 15 deletions(-) diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py index 70b84b0b2..357648162 100644 --- a/recbole/config/configurator.py +++ b/recbole/config/configurator.py @@ -284,15 +284,6 @@ def _set_default_parameters(self): else: raise ValueError('Either Model has attr \'input_type\',' 'or arg \'loss_type\' should exist in config.') - if self.final_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL and \ - self.final_config_dict['benchmark_filename'] is not None and \ - self.final_config_dict['augmentation']: - raise ValueError( - f'Benchmark datasets for sequential model {self.model} ' - f'should be augmented in advance, which should not be augmented again and ' - f'config \'augmentation\' should be False.' - ) - eval_type = None for metric in self.final_config_dict['metrics']: if metric.lower() in individual_metrics: diff --git a/recbole/data/dataset/sequential_dataset.py b/recbole/data/dataset/sequential_dataset.py index 1f03dff3f..8ce84b70a 100644 --- a/recbole/data/dataset/sequential_dataset.py +++ b/recbole/data/dataset/sequential_dataset.py @@ -26,13 +26,11 @@ class SequentialDataset(Dataset): which can accelerate the data loader. Attributes: - augmentation (bool): Whether the interactions should be augmented in RecBole. max_item_list_len (int): Max length of historical item list. item_list_length_field (str): Field name for item lists' length. """ def __init__(self, config): - self.augmentation = config['augmentation'] self.max_item_list_len = config['MAX_ITEM_LIST_LENGTH'] self.item_list_length_field = config['ITEM_LIST_LENGTH_FIELD'] super().__init__(config) @@ -42,8 +40,8 @@ def _change_feat_format(self): then perform data augmentation. """ super()._change_feat_format() - - if not self.augmentation: + + if self.config['benchmark_filename'] is not None: return self.logger.debug('Augmentation for sequential recommendation.') self.data_augmentation() diff --git a/recbole/properties/quick_start_config/sequential.yaml b/recbole/properties/quick_start_config/sequential.yaml index 2370f18b4..d9e144449 100644 --- a/recbole/properties/quick_start_config/sequential.yaml +++ b/recbole/properties/quick_start_config/sequential.yaml @@ -2,4 +2,3 @@ eval_args: split: {'LS': 2} order: TO mode: full -augmentation: True diff --git a/recbole/properties/quick_start_config/sequential_DIN.yaml b/recbole/properties/quick_start_config/sequential_DIN.yaml index 3e7f4a681..8d6b01edc 100644 --- a/recbole/properties/quick_start_config/sequential_DIN.yaml +++ b/recbole/properties/quick_start_config/sequential_DIN.yaml @@ -4,4 +4,3 @@ eval_args: mode: uni100 metrics: ['AUC', 'LogLoss'] valid_metric: AUC -augmentation: True \ No newline at end of file From d00852d081e2b7378794c0776c88a66d1316cb5d Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Sat, 10 Jul 2021 02:19:57 +0800 Subject: [PATCH 10/12] Update configurator.py --- recbole/config/configurator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py index 357648162..f2e15b21d 100644 --- a/recbole/config/configurator.py +++ b/recbole/config/configurator.py @@ -3,7 +3,7 @@ # @Email : linzihan.super@foxmail.com # UPDATE -# @Time : 2020/10/04, 2021/7/8, 2021/2/17, 2021/6/30 +# @Time : 2020/10/04, 2021/3/2, 2021/2/17, 2021/6/30 # @Author : Shanlei Mu, Yupeng Hou, Jiawei Guan, Xingyu Pan # @Email : slmu@ruc.edu.cn, houyupeng@ruc.edu.cn, Guanjw@ruc.edu.cn, xy_pan@foxmail.com From 497327dbe10b3d25bb61dfbeebee2381b7255c4b Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Sat, 10 Jul 2021 03:22:23 +0000 Subject: [PATCH 11/12] FIX: remove arg augmentation --- recbole/properties/dataset/sample.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/recbole/properties/dataset/sample.yaml b/recbole/properties/dataset/sample.yaml index ff4ac4915..d9869e5e2 100644 --- a/recbole/properties/dataset/sample.yaml +++ b/recbole/properties/dataset/sample.yaml @@ -43,7 +43,6 @@ normalize_field: ~ normalize_all: ~ # Sequential Model Needed -augmentation: False ITEM_LIST_LENGTH_FIELD: item_length LIST_SUFFIX: _list MAX_ITEM_LIST_LENGTH: 50 From 99c8d591735e549ac2ffa09aba46c569266a9d70 Mon Sep 17 00:00:00 2001 From: Yupeng Hou Date: Sat, 10 Jul 2021 11:25:11 +0800 Subject: [PATCH 12/12] Update argument_list.py --- recbole/utils/argument_list.py | 1 - 1 file changed, 1 deletion(-) diff --git a/recbole/utils/argument_list.py b/recbole/utils/argument_list.py index 31b719989..632f442bd 100644 --- a/recbole/utils/argument_list.py +++ b/recbole/utils/argument_list.py @@ -45,7 +45,6 @@ 'ITEM_LIST_LENGTH_FIELD', 'LIST_SUFFIX', 'MAX_ITEM_LIST_LENGTH', 'POSITION_FIELD', 'HEAD_ENTITY_ID_FIELD', 'TAIL_ENTITY_ID_FIELD', 'RELATION_ID_FIELD', 'ENTITY_ID_FIELD', 'load_col', 'unload_col', 'unused_col', 'additional_feat_suffix', - 'augmentation', 'max_user_inter_num', 'min_user_inter_num', 'max_item_inter_num', 'min_item_inter_num', 'lowest_val', 'highest_val', 'equal_val', 'not_equal_val', 'fields_in_same_space',