RUCAIBox · 2017pxy · Jul 6, 2021 · Jul 5, 2021 · Jul 5, 2021 · Jul 5, 2021
diff --git a/docs/source/user_guide/data/atomic_files.rst b/docs/source/user_guide/data/atomic_files.rst
@@ -141,4 +141,4 @@ For example, if you want to map the tokens of ``ent_id`` into the same space of
         # inter/user/item/...: As usual
         ent: [ent_id, ent_emb]
 
-    fields_in_same_space: [[ent_id, entity_id]]
+    alias_of_entity_id: [ent_id]
diff --git a/docs/source/user_guide/data/data_args.rst b/docs/source/user_guide/data/data_args.rst
@@ -88,7 +88,10 @@ Filter by number of interactions
 Preprocessing
 -----------------
 
-- ``fields_in_same_space (list)`` : List of spaces. Space is a list of string similar to the fields' names. The fields in the same space will be remapped into the same index system. Note that if you want to make some fields remapped in the same space with entities, then just set ``fields_in_same_space = [entity_id, xxx, ...]``. (if ``ENTITY_ID_FIELD != 'entity_id'``, then change the ``'entity_id'`` in the above example.) Defaults to ``None``.
+- ``alias_of_user_id (list)``: List of fields' names, which will be remapped into the same index system with ``USER_ID_FIELD``. Defaults to ``None``.
+- ``alias_of_item_id (list)``: List of fields' names, which will be remapped into the same index system with ``ITEM_ID_FIELD``. Defaults to ``None``.
+- ``alias_of_entity_id (list)``: List of fields' names, which will be remapped into the same index system with ``ENTITY_ID_FIELD``, ``HEAD_ENTITY_ID_FIELD`` and ``TAIL_ENTITY_ID_FIELD``. Defaults to ``None``.
+- ``alias_of_relation_id (list)``: List of fields' names, which will be remapped into the same index system with ``RELATION_ID_FIELD``. Defaults to ``None``.
 - ``preload_weight (dict)`` : Has the format ``{k (str): v (float)}, ...``. ``k`` is a token field, representing the IDs of each row of the preloaded weight matrix. ``v`` is a float-like field. Each pair of ``k`` and ``v`` should be from the same atomic file. This arg can be used to load pretrained vectors. Defaults to ``None``.
 - ``normalize_field (list)`` : List of field names to be normalized. Note that only float-like fields can be normalized. Defaults to ``None``.
 - ``normalize_all (bool)`` : Normalize all the float like fields if ``True``. Defaults to ``True``.

diff --git a/docs/source/user_guide/model/sequential/gru4reckg.rst b/docs/source/user_guide/model/sequential/gru4reckg.rst
@@ -46,9 +46,7 @@ And then:
             kg: [head_id, relation_id, tail_id]
             link: [item_id, entity_id]
             ent_feature: [ent_id, ent_vec]
-        fields_in_same_space: [
-            [ent_id, entity_id]
-        ]
+        alias_of_entity_id: [ent_id]
         preload_weight:
             ent_id: ent_vec
         additional_feat_suffix: [ent_feature]

diff --git a/docs/source/user_guide/model/sequential/ksr.rst b/docs/source/user_guide/model/sequential/ksr.rst
@@ -58,10 +58,8 @@ And then:
             link: [item_id, entity_id]
             ent_feature: [ent_id, ent_vec]
             rel_feature: [rel_id, rel_vec]
-        fields_in_same_space: [
-            [ent_id, entity_id]
-            [rel_id, relation_id]
-        ]
+        alias_of_entity_id: [ent_id]
+        alias_of_relation_id: [rel_id]
         preload_weight:
             ent_id: ent_vec
             rel_id: rel_vec

diff --git a/docs/source/user_guide/usage/load_pretrained_embedding.rst b/docs/source/user_guide/usage/load_pretrained_embedding.rst
@@ -22,9 +22,9 @@ Secondly, update the args as (suppose that ``USER_ID_FIELD: user_id``):
     load_col:
         # inter/user/item/...: As usual
         useremb: [uid, user_emb]
-    fields_in_same_space: [[uid, user_id]]
+    alias_of_user_id: [uid]
     preload_weight: 
-    	uid: user_emb
+        uid: user_emb
 
 Then, this additional embedding feature file will be loaded into the :class:`Dataset` object. These new features can be accessed as following:
 
@@ -39,6 +39,6 @@ In your model, user embedding matrix can be initialized by your pre-trained embe
 
     class YourModel(GeneralRecommender):
         def __init__(self, config, dataset):
-        	pretrained_user_emb = dataset.get_preload_weight('uid')
-        	self.user_embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_user_emb))
+            pretrained_user_emb = dataset.get_preload_weight('uid')
+            self.user_embedding = nn.Embedding.from_pretrained(torch.from_numpy(pretrained_user_emb))
 
diff --git a/recbole/data/dataset/dataset.py b/recbole/data/dataset/dataset.py
@@ -105,6 +105,7 @@ def _from_scratch(self):
         self._get_preset()
         self._get_field_from_config()
         self._load_data(self.dataset_name, self.dataset_path)
+        self._init_alias()
         self._data_processing()
 
     def _get_preset(self):
@@ -117,6 +118,7 @@ def _get_preset(self):
         self.field2id_token = {}
         self.field2token_id = {}
         self.field2seqlen = self.config['seq_len'] or {}
+        self.alias = {}
         self._preloaded_weight = {}
         self.benchmark_filename_list = self.config['benchmark_filename']
 
@@ -441,6 +443,34 @@ def _load_feat(self, filepath, source):
             self.field2seqlen[field] = max(map(len, df[field].values))
         return df
 
+    def _set_alias(self, alias_name, default_value):
+        """Build the alias group ``alias_name`` and store it in :attr:`alias`.
+
+        Args:
+            alias_name (str): Name of the group; ``config['alias_of_<alias_name>']`` is read.
+            default_value (list): Fields placed at the front of the group.
+        """
+        configured = self.config[f'alias_of_{alias_name}'] or []
+        fields = np.array(default_value + configured)
+        # ``np.unique`` returns sorted values, so the first-occurrence indices are
+        # used instead to drop duplicates while keeping the original field order.
+        _, first_seen = np.unique(fields, return_index=True)
+        first_seen.sort()
+        self.alias[alias_name] = fields[first_seen]
+
+    def _init_alias(self):
+        """Set the ``user_id`` and ``item_id`` groups of :attr:`alias`, and set :attr:`_rest_fields`.
+
+        Every alias group is validated: no field may appear in two different groups,
+        and every aliased field must be a token-like field. :attr:`_rest_fields` keeps
+        the token-like fields that belong to no alias group.
+
+        Raises:
+            ValueError: If two alias groups share a field, or if an alias group
+                contains a non-token-like field.
+        """
+        self._set_alias('user_id', [self.uid_field])
+        self._set_alias('item_id', [self.iid_field])
+
+        for alias_name_1, alias_1 in self.alias.items():
+            for alias_name_2, alias_2 in self.alias.items():
+                if alias_name_1 != alias_name_2:
+                    intersect = np.intersect1d(alias_1, alias_2, assume_unique=True)
+                    if len(intersect) > 0:
+                        raise ValueError(f'`alias_of_{alias_name_1}` and `alias_of_{alias_name_2}` '
+                                         f'should not have the same field {list(intersect)}.')
+
+        self._rest_fields = self.token_like_fields
+        for alias_name, alias in self.alias.items():
+            isin = np.isin(alias, self._rest_fields, assume_unique=True)
+            # ``isin.all()`` is a ``numpy.bool_``, which is never identical to the
+            # Python ``False`` singleton, so test its truth value instead of ``is False``.
+            if not isin.all():
+                raise ValueError(f'`alias_of_{alias_name}` should not contain '
+                                 f'non-token-like field {list(alias[~isin])}.')
+            self._rest_fields = np.setdiff1d(self._rest_fields, alias, assume_unique=True)
+
     def _user_item_feat_preparation(self):
         """Sort :attr:`user_feat` and :attr:`item_feat` by ``user_id`` or ``item_id``.
         Missing values will be filled later.
@@ -860,49 +890,15 @@ def _set_label_by_threshold(self):
                 raise ValueError(f'Field [{field}] not in inter_feat.')
             self._del_col(self.inter_feat, field)
 
-    def _get_fields_in_same_space(self):
-        """Parsing ``config['fields_in_same_space']``. See :doc:`../user_guide/data/data_args` for detail arg setting.
-
-        Note:
-            - Each field can only exist ONCE in ``config['fields_in_same_space']``.
-            - user_id and item_id can not exist in ``config['fields_in_same_space']``.
-            - only token-like fields can exist in ``config['fields_in_same_space']``.
-        """
-        fields_in_same_space = self.config['fields_in_same_space'] or []
-        fields_in_same_space = [set(_) for _ in fields_in_same_space]
-        additional = []
-        token_like_fields = self.token_like_fields
-        for field in token_like_fields:
-            count = 0
-            for field_set in fields_in_same_space:
-                if field in field_set:
-                    count += 1
-            if count == 0:
-                additional.append({field})
-            elif count == 1:
-                continue
-            else:
-                raise ValueError(f'Field [{field}] occurred in `fields_in_same_space` more than one time.')
-
-        for field_set in fields_in_same_space:
-            if self.uid_field in field_set and self.iid_field in field_set:
-                raise ValueError('uid_field and iid_field can\'t in the same ID space')
-            for field in field_set:
-                if field not in token_like_fields:
-                    raise ValueError(f'Field [{field}] is not a token-like field.')
-
-        fields_in_same_space.extend(additional)
-        return fields_in_same_space
-
-    def _get_remap_list(self, field_set):
+    def _get_remap_list(self, field_list):
         """Transfer set of fields in the same remapping space into remap list.
 
-        If ``uid_field`` or ``iid_field`` in ``field_set``,
+        If ``uid_field`` or ``iid_field`` in ``field_list``,
         field in :attr:`inter_feat` will be remapped firstly,
         then field in :attr:`user_feat` or :attr:`item_feat` will be remapped next, finally others.
 
         Args:
-            field_set (set): Set of fields in the same remapping space
+            field_list (numpy.ndarray): List of fields in the same remapping space.
 
         Returns:
             list:
@@ -912,29 +908,23 @@ def _get_remap_list(self, field_set):
 
             They will be concatenated in order, and remapped together.
         """
+
         remap_list = []
-        for field, feat in zip([self.uid_field, self.iid_field], [self.user_feat, self.item_feat]):
-            if field in field_set:
-                field_set.remove(field)
-                remap_list.append((self.inter_feat, field, FeatureType.TOKEN))
-                if feat is not None:
-                    remap_list.append((feat, field, FeatureType.TOKEN))
-        for field in field_set:
-            source = self.field2source[field]
-            if isinstance(source, FeatureSource):
-                source = source.value
-            feat = getattr(self, f'{source}_feat')
+        for field in field_list:
             ftype = self.field2type[field]
-            remap_list.append((feat, field, ftype))
+            for feat in self.field2feats(field):
+                remap_list.append((feat, field, ftype))
         return remap_list
 
     def _remap_ID_all(self):
-        """Get ``config['fields_in_same_space']`` firstly, and remap each.
+        """Remap all token-like fields.
+
+        Each alias group in :attr:`alias` is remapped together, then every field in
+        :attr:`_rest_fields` is remapped on its own.
         """
-        fields_in_same_space = self._get_fields_in_same_space()
-        self.logger.debug(set_color('fields_in_same_space', 'blue') + f': {fields_in_same_space}')
-        for field_set in fields_in_same_space:
-            remap_list = self._get_remap_list(field_set)
-            self._remap(remap_list)
+        for alias_fields in self.alias.values():
+            self._remap(self._get_remap_list(alias_fields))
+
+        for lone_field in self._rest_fields:
+            self._remap(self._get_remap_list(np.array([lone_field])))
 
     def _concat_remaped_tokens(self, remap_list):
@@ -1087,6 +1077,24 @@ def copy_field_property(self, dest_field, source_field):
         self.field2source[dest_field] = self.field2source[source_field]
         self.field2seqlen[dest_field] = self.field2seqlen[source_field]
 
+    def field2feats(self, field):
+        """Return the feature tables in which ``field`` appears.
+
+        Args:
+            field: Name of a field recorded in :attr:`field2source`.
+
+        Returns:
+            list: For ``uid_field`` / ``iid_field``, ``inter_feat`` followed by
+            ``user_feat`` / ``item_feat`` when that table is not ``None``; for any
+            other field, the single ``<source>_feat`` attribute of its source.
+
+        Raises:
+            ValueError: If ``field`` is not in :attr:`field2source`.
+        """
+        if field not in self.field2source:
+            raise ValueError(f'Field [{field}] not defined in dataset.')
+
+        if field == self.uid_field:
+            side_feat = self.user_feat
+        elif field == self.iid_field:
+            side_feat = self.item_feat
+        else:
+            # The source is either a plain string or an enum-like object exposing ``.value``.
+            source = self.field2source[field]
+            source_name = source if isinstance(source, str) else source.value
+            return [getattr(self, f'{source_name}_feat')]
+
+        if side_feat is None:
+            return [self.inter_feat]
+        return [self.inter_feat, side_feat]
+
     def token2id(self, field, tokens):
         """Map external tokens to internal ids.