Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REFACTOR: refactor remap_ID_all #868

Merged
merged 4 commits into from
Jul 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/user_guide/data/atomic_files.rst
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,4 @@ For example, if you want to map the tokens of ``ent_id`` into the same space of
# inter/user/item/...: As usual
ent: [ent_id, ent_emb]

fields_in_same_space: [[ent_id, entity_id]]
alias_of_entity_id: [ent_id]
5 changes: 4 additions & 1 deletion docs/source/user_guide/data/data_args.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,10 @@ Filter by number of interactions
Preprocessing
-----------------

- ``fields_in_same_space (list)`` : List of spaces. Space is a list of string similar to the fields' names. The fields in the same space will be remapped into the same index system. Note that if you want to make some fields remapped in the same space with entities, then just set ``fields_in_same_space = [entity_id, xxx, ...]``. (if ``ENTITY_ID_FIELD != 'entity_id'``, then change the ``'entity_id'`` in the above example.) Defaults to ``None``.
- ``alias_of_user_id (list)``: List of fields' names, which will be remapped into the same index system with ``USER_ID_FIELD``. Defaults to ``None``.
- ``alias_of_item_id (list)``: List of fields' names, which will be remapped into the same index system with ``ITEM_ID_FIELD``. Defaults to ``None``.
- ``alias_of_entity_id (list)``: List of fields' names, which will be remapped into the same index system with ``ENTITY_ID_FIELD``, ``HEAD_ENTITY_ID_FIELD`` and ``TAIL_ENTITY_ID_FIELD``. Defaults to ``None``.
- ``alias_of_relation_id (list)``: List of fields' names, which will be remapped into the same index system with ``RELATION_ID_FIELD``. Defaults to ``None``.
- ``preload_weight (dict)`` : Has the format ``{k (str): v (float)}, ...``. ``k`` is a token field, representing the IDs of each row of the preloaded weight matrix. ``v`` is a float-like field. Each pair of ``k`` and ``v`` should be from the same atomic file. This arg can be used to load pretrained vectors. Defaults to ``None``.
- ``normalize_field (list)`` : List of field names to be normalized. Note that only float-like fields can be normalized. Defaults to ``None``.
- ``normalize_all (bool)`` : Normalize all the float like fields if ``True``. Defaults to ``True``.
Expand Down
4 changes: 1 addition & 3 deletions docs/source/user_guide/model/sequential/gru4reckg.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ And then:
kg: [head_id, relation_id, tail_id]
link: [item_id, entity_id]
ent_feature: [ent_id, ent_vec]
fields_in_same_space: [
[ent_id, entity_id]
]
alias_of_entity_id: [ent_id]
preload_weight:
ent_id: ent_vec
additional_feat_suffix: [ent_feature]
Expand Down
6 changes: 2 additions & 4 deletions docs/source/user_guide/model/sequential/ksr.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,8 @@ And then:
link: [item_id, entity_id]
ent_feature: [ent_id, ent_vec]
rel_feature: [rel_id, rel_vec]
fields_in_same_space: [
[ent_id, entity_id]
[rel_id, relation_id]
]
alias_of_entity_id: [ent_id]
alias_of_relation_id: [rel_id]
preload_weight:
ent_id: ent_vec
rel_id: rel_vec
Expand Down
8 changes: 4 additions & 4 deletions docs/source/user_guide/usage/load_pretrained_embedding.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ Secondly, update the args as (suppose that ``USER_ID_FIELD: user_id``):
load_col:
# inter/user/item/...: As usual
useremb: [uid, user_emb]
fields_in_same_space: [[uid, user_id]]
alias_of_user_id: [uid]
preload_weight:
uid: user_emb
uid: user_emb

Then, this additional embedding feature file will be loaded into the :class:`Dataset` object. These new features can be accessed as follows:

Expand All @@ -39,6 +39,6 @@ In your model, user embedding matrix can be initialized by your pre-trained embe

class YourModel(GeneralRecommender):
def __init__(self, config, dataset):
pretrained_user_emb = dataset.get_preload_weight('uid')
self.user_embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_user_emb))
pretrained_user_emb = dataset.get_preload_weight('uid')
self.user_embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_user_emb))

114 changes: 61 additions & 53 deletions recbole/data/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _from_scratch(self):
self._get_preset()
self._get_field_from_config()
self._load_data(self.dataset_name, self.dataset_path)
self._init_alias()
self._data_processing()

def _get_preset(self):
Expand All @@ -117,6 +118,7 @@ def _get_preset(self):
self.field2id_token = {}
self.field2token_id = {}
self.field2seqlen = self.config['seq_len'] or {}
self.alias = {}
self._preloaded_weight = {}
self.benchmark_filename_list = self.config['benchmark_filename']

Expand Down Expand Up @@ -441,6 +443,34 @@ def _load_feat(self, filepath, source):
self.field2seqlen[field] = max(map(len, df[field].values))
return df

def _set_alias(self, alias_name, default_value):
    """Register the alias group ``alias_name`` in :attr:`alias`.

    The group is ``default_value`` followed by the fields listed in
    ``config['alias_of_<alias_name>']`` (treated as empty when unset),
    with duplicates dropped while keeping first-occurrence order.

    Args:
        alias_name (str): Name of the alias group, e.g. ``'user_id'``.
        default_value (list): Fields that always belong to the group.
    """
    configured = self.config[f'alias_of_{alias_name}'] or []
    candidates = np.array(default_value + configured)
    # ``np.unique`` returns sorted values; sorting the first-occurrence
    # positions instead restores the original ordering of the fields.
    first_positions = np.unique(candidates, return_index=True)[1]
    self.alias[alias_name] = candidates[np.sort(first_positions)]

def _init_alias(self):
    """Set the ``user_id`` and ``item_id`` alias groups and :attr:`_rest_fields`.

    The alias groups are stored in :attr:`alias` through :meth:`_set_alias`.
    :attr:`_rest_fields` ends up holding the token-like fields that do not
    belong to any alias group.

    Raises:
        ValueError: If two alias groups share a field, or if an alias group
            contains a field that is not token-like.
    """
    self._set_alias('user_id', [self.uid_field])
    self._set_alias('item_id', [self.iid_field])

    # Every field may belong to at most one alias group.
    for alias_name_1, alias_1 in self.alias.items():
        for alias_name_2, alias_2 in self.alias.items():
            if alias_name_1 != alias_name_2:
                intersect = np.intersect1d(alias_1, alias_2, assume_unique=True)
                if len(intersect) > 0:
                    raise ValueError(f'`alias_of_{alias_name_1}` and `alias_of_{alias_name_2}` '
                                     f'should not have the same field {intersect.tolist()}.')

    self._rest_fields = self.token_like_fields
    for alias_name, alias in self.alias.items():
        isin = np.isin(alias, self._rest_fields, assume_unique=True)
        # ``isin.all()`` yields a ``numpy.bool_``, which is never the ``False``
        # singleton, so an identity comparison would silently skip this check.
        if not isin.all():
            raise ValueError(f'`alias_of_{alias_name}` should not contain '
                             f'non-token-like field {alias[~isin].tolist()}.')
        self._rest_fields = np.setdiff1d(self._rest_fields, alias, assume_unique=True)

def _user_item_feat_preparation(self):
"""Sort :attr:`user_feat` and :attr:`item_feat` by ``user_id`` or ``item_id``.
Missing values will be filled later.
Expand Down Expand Up @@ -860,49 +890,15 @@ def _set_label_by_threshold(self):
raise ValueError(f'Field [{field}] not in inter_feat.')
self._del_col(self.inter_feat, field)

def _get_fields_in_same_space(self):
    """Parse ``config['fields_in_same_space']`` into a list of field sets.

    See :doc:`../user_guide/data/data_args` for detail arg setting.

    Every token-like field that is not mentioned in the config is appended
    as its own single-element space.

    Returns:
        list: List of sets; each set holds the fields sharing one ID space.

    Raises:
        ValueError: If a token-like field occurs in more than one space,
            if ``uid_field`` and ``iid_field`` share a space, or if a
            configured field is not token-like.
    """
    token_fields = self.token_like_fields
    spaces = [set(space) for space in self.config['fields_in_same_space'] or []]

    singletons = []
    for field in token_fields:
        occurrences = sum(1 for space in spaces if field in space)
        if occurrences > 1:
            raise ValueError(f'Field [{field}] occurred in `fields_in_same_space` more than one time.')
        if occurrences == 0:
            # Unconfigured fields each get a private ID space.
            singletons.append({field})

    for space in spaces:
        if self.uid_field in space and self.iid_field in space:
            raise ValueError('uid_field and iid_field can\'t in the same ID space')
        for field in space:
            if field not in token_fields:
                raise ValueError(f'Field [{field}] is not a token-like field.')

    return spaces + singletons

def _get_remap_list(self, field_set):
def _get_remap_list(self, field_list):
"""Transfer set of fields in the same remapping space into remap list.

If ``uid_field`` or ``iid_field`` in ``field_set``,
field in :attr:`inter_feat` will be remapped firstly,
then field in :attr:`user_feat` or :attr:`item_feat` will be remapped next, finally others.

Args:
field_set (set): Set of fields in the same remapping space
field_list (numpy.ndarray): List of fields in the same remapping space.

Returns:
list:
Expand All @@ -912,29 +908,23 @@ def _get_remap_list(self, field_set):

They will be concatenated in order, and remapped together.
"""

remap_list = []
for field, feat in zip([self.uid_field, self.iid_field], [self.user_feat, self.item_feat]):
if field in field_set:
field_set.remove(field)
remap_list.append((self.inter_feat, field, FeatureType.TOKEN))
if feat is not None:
remap_list.append((feat, field, FeatureType.TOKEN))
for field in field_set:
source = self.field2source[field]
if isinstance(source, FeatureSource):
source = source.value
feat = getattr(self, f'{source}_feat')
for field in field_list:
ftype = self.field2type[field]
remap_list.append((feat, field, ftype))
for feat in self.field2feats(field):
remap_list.append((feat, field, ftype))
return remap_list

def _remap_ID_all(self):
"""Get ``config['fields_in_same_space']`` firstly, and remap each.
"""Remap all token-like fields.
"""
fields_in_same_space = self._get_fields_in_same_space()
self.logger.debug(set_color('fields_in_same_space', 'blue') + f': {fields_in_same_space}')
for field_set in fields_in_same_space:
remap_list = self._get_remap_list(field_set)
for alias in self.alias.values():
remap_list = self._get_remap_list(alias)
self._remap(remap_list)

for field in self._rest_fields:
remap_list = self._get_remap_list(np.array([field]))
self._remap(remap_list)

def _concat_remaped_tokens(self, remap_list):
Expand Down Expand Up @@ -1087,6 +1077,24 @@ def copy_field_property(self, dest_field, source_field):
self.field2source[dest_field] = self.field2source[source_field]
self.field2seqlen[dest_field] = self.field2seqlen[source_field]

def field2feats(self, field):
    """Return the feature tables that contain ``field``.

    ``uid_field`` and ``iid_field`` map to :attr:`inter_feat` plus, when it
    is not ``None``, :attr:`user_feat` or :attr:`item_feat` respectively.
    Any other field maps to the single ``<source>_feat`` attribute named by
    its entry in :attr:`field2source`.

    Args:
        field (str): Name of the field to look up.

    Returns:
        list: The feature tables holding ``field``.

    Raises:
        ValueError: If ``field`` is not present in :attr:`field2source`.
    """
    if field not in self.field2source:
        raise ValueError(f'Field [{field}] not defined in dataset.')

    if field == self.uid_field:
        side_attr = 'user_feat'
    elif field == self.iid_field:
        side_attr = 'item_feat'
    else:
        origin = self.field2source[field]
        # Sources that are not plain strings expose their name via ``.value``.
        origin_name = origin if isinstance(origin, str) else origin.value
        return [getattr(self, f'{origin_name}_feat')]

    tables = [self.inter_feat]
    side_table = getattr(self, side_attr)
    if side_table is not None:
        tables.append(side_table)
    return tables

def token2id(self, field, tokens):
"""Map external tokens to internal ids.

Expand Down
Loading