Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implicit sequence weights and Implicit factorization weights #122

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
6952630
MAINT: adding spotlight results files to gitignore
nikisix Jul 11, 2018
910fb9a
feature: sample weight implementation for loss functions and implicit…
nikisix Jul 17, 2018
f8a8409
lint: linting changes
nikisix Jul 17, 2018
788f369
partial: start modifying sequence interactions to incorporate sample_…
nikisix Jul 18, 2018
191356b
MAINT: commit based on pr 120 discussion. rename base_loss to _weight…
nikisix Jul 18, 2018
528bb48
Merge branch 'sample-weights' into seq-weights
nikisix Jul 18, 2018
0ba7164
LINT: BUG:
nikisix Jul 18, 2018
91a4d22
LINT: BUG:
nikisix Jul 18, 2018
3fa6246
FEATURE: sequence sample weights working
nikisix Jul 23, 2018
28faf31
MAINT: augmenting implicit_factorizers fit method to handle sample_we…
nikisix Jul 23, 2018
68ac9f2
Merge branch 'sample-weights' into seq-weights
nikisix Jul 23, 2018
b4a6a7d
FEAT: implicit sequence masking accomplished via a sample weight of zero
nikisix Jul 24, 2018
9bd923e
MAINT: combining Interactions.to_sequence and Interactions.to_weighte…
nikisix Jul 24, 2018
058edde
LINT: linting interactions and sequence/implicit
nikisix Jul 24, 2018
02e3dda
BUG: DOC: fix weight flow logic in sequence implicit, and add a doc s…
nikisix Jul 24, 2018
6bcdd16
MAINT: casting to cuda tensor if gpu is enabled
jwinemiller-aa Jul 31, 2018
731835b
MAINT: one more gpu cast
nikisix Jul 31, 2018
64af649
BUG: shuffling weight sequences alongside interactions. #122
nikisix Aug 6, 2018
0b3b5e2
MAINT: removing masks from losses
nikisix Aug 29, 2018
3905bc4
TEST: adding sequential weight tests for zeros, normal(ones), and hig…
nikisix Aug 31, 2018
43b32be
LINT: datasets/synthetic and tests/sequence/tests_sequence_weights
nikisix Sep 6, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# spotlight results files
*_results.txt

*~
*#*

Expand All @@ -18,3 +21,4 @@

# IDE
tags
cscope.out
26 changes: 24 additions & 2 deletions spotlight/datasets/synthetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def generate_sequential(num_users=100,
num_interactions=10000,
concentration_parameter=0.1,
order=3,
random_state=None):
random_state=None,
weight_type=None
):
"""
Generate a dataset of user-item interactions where sequential
information matters.
Expand Down Expand Up @@ -100,6 +102,8 @@ def generate_sequential(num_users=100,
order of the Markov chain
random_state: numpy.random.RandomState, optional
random state used to generate the data
weight_type: string, optional
    Type of synthetic sample weights to generate. Must be one of
    'ones', 'zeros', or 'high'; if None, no weights are generated.

Returns
-------
Expand All @@ -108,6 +112,22 @@ def generate_sequential(num_users=100,
instance of the interactions class
"""

weights = None
weight_types = ['ones', 'zeros', 'high']
if weight_type is not None:
if weight_type not in weight_types:
raise ValueError(
"weight_type {} not in {}"
.format(weight_type, weight_types)
)
if weight_type == 'ones':
weights = np.ones(num_interactions)
elif weight_type == 'zeros':
weights = np.zeros(num_interactions)
elif weight_type == 'high':
large_weight = 1E9
weights = large_weight * np.ones(num_interactions)

if random_state is None:
random_state = np.random.RandomState()

Expand All @@ -132,4 +152,6 @@ def generate_sequential(num_users=100,
ratings=ratings,
timestamps=timestamps,
num_users=num_users,
num_items=num_items)
num_items=num_items,
weights=weights
)
47 changes: 36 additions & 11 deletions spotlight/factorization/implicit.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def _check_input(self, user_ids, item_ids, allow_items_none=False):

def fit(self, interactions, verbose=False):
"""
Fit the model.
Fit the model using sample weights.

When called repeatedly, model fitting will resume from
the point at which training stopped in the previous fit
Expand All @@ -198,9 +198,11 @@ def fit(self, interactions, verbose=False):
verbose: bool
Output additional information about current epoch and loss.
"""

user_ids = interactions.user_ids.astype(np.int64)
item_ids = interactions.item_ids.astype(np.int64)
sample_weights = None
if interactions.weights is not None:
sample_weights = interactions.weights.astype(np.float32)

if not self._initialized:
self._initialize(interactions)
Expand All @@ -209,22 +211,41 @@ def fit(self, interactions, verbose=False):

for epoch_num in range(self._n_iter):

users, items = shuffle(user_ids,
item_ids,
random_state=self._random_state)
users, items, sample_weights = shuffle(
user_ids,
item_ids,
sample_weights,
random_state=self._random_state
)

user_ids_tensor = gpu(torch.from_numpy(users),
self._use_cuda)
item_ids_tensor = gpu(torch.from_numpy(items),
self._use_cuda)
sample_weights_tensor = None
if sample_weights is not None:
sample_weights_tensor = gpu(
torch.from_numpy(sample_weights),
self._use_cuda
)

epoch_loss = 0.0

for (minibatch_num,
(batch_user,
batch_item)) in enumerate(minibatch(user_ids_tensor,
item_ids_tensor,
batch_size=self._batch_size)):
for (
minibatch_num,
(
batch_user,
batch_item,
batch_sample_weights
)
) in enumerate(
minibatch(
user_ids_tensor,
item_ids_tensor,
sample_weights_tensor,
batch_size=self._batch_size
)
):

positive_prediction = self._net(batch_user, batch_item)

Expand All @@ -236,7 +257,11 @@ def fit(self, interactions, verbose=False):

self._optimizer.zero_grad()

loss = self._loss_func(positive_prediction, negative_prediction)
loss = self._loss_func(
positive_prediction,
negative_prediction,
sample_weights=batch_sample_weights
)
epoch_loss += loss.item()

loss.backward()
Expand Down
39 changes: 31 additions & 8 deletions spotlight/interactions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def _sliding_window(tensor, window_size, step_size=1):
yield tensor[max(i - window_size, 0):i]


def _generate_sequences(user_ids, item_ids,
def _generate_sequences(user_ids, sequence_elements,
indices,
max_sequence_length,
step_size):
Expand All @@ -28,7 +28,7 @@ def _generate_sequences(user_ids, item_ids,
else:
stop_idx = indices[i + 1]

for seq in _sliding_window(item_ids[start_idx:stop_idx],
for seq in _sliding_window(sequence_elements[start_idx:stop_idx],
max_sequence_length,
step_size):

Expand Down Expand Up @@ -63,7 +63,7 @@ class Interactions(object):
timestamps: array of np.int32, optional
array of timestamps
weights: array of np.float32, optional
array of weights
array of sample importance weights
num_users: int, optional
Number of distinct users in the dataset.
Must be larger than the maximum user id
Expand All @@ -85,7 +85,7 @@ class Interactions(object):
timestamps: array of np.int32, optional
array of timestamps
weights: array of np.float32, optional
array of weights
array of sample importance weights
num_users: int, optional
Number of distinct users in the dataset.
num_items: int, optional
Expand Down Expand Up @@ -218,7 +218,7 @@ def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_siz
sequence interactions: :class:`~SequenceInteractions`
The resulting sequence interactions.
"""

weighted = self.weights is not None
if self.timestamps is None:
raise ValueError('Cannot convert to sequences, '
'timestamps not available.')
Expand All @@ -236,6 +236,8 @@ def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_siz

user_ids = self.user_ids[sort_indices]
item_ids = self.item_ids[sort_indices]
if weighted:
weights = self.weights[sort_indices]

user_ids, indices, counts = np.unique(user_ids,
return_index=True,
Expand All @@ -245,6 +247,10 @@ def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_siz

sequences = np.zeros((num_subsequences, max_sequence_length),
dtype=np.int32)
weight_sequences = None
if weighted:
weight_sequences = np.zeros((num_subsequences, max_sequence_length),
dtype=np.int32)
sequence_users = np.empty(num_subsequences,
dtype=np.int32)
for i, (uid,
Expand All @@ -256,13 +262,25 @@ def to_sequence(self, max_sequence_length=10, min_sequence_length=None, step_siz
sequences[i][-len(seq):] = seq
sequence_users[i] = uid

if weighted:
for i, (uid,
seq) in enumerate(_generate_sequences(user_ids,
weights,
indices,
max_sequence_length,
step_size)):
weight_sequences[i][-len(seq):] = seq

if min_sequence_length is not None:
long_enough = sequences[:, -min_sequence_length] != 0
sequences = sequences[long_enough]
sequence_users = sequence_users[long_enough]
if weighted:
weight_sequences = weight_sequences[long_enough]

return (SequenceInteractions(sequences,
user_ids=sequence_users,
weight_sequences=weight_sequences,
num_items=self.num_items))


Expand All @@ -276,6 +294,11 @@ class SequenceInteractions(object):
sequences: array of np.int32 of shape (num_sequences x max_sequence_length)
The interactions sequence matrix, as produced by
:func:`~Interactions.to_sequence`
user_ids: array of np.int32, optional
    Array of user ids, one per sequence: the user whose
    interactions each row of item_id sequences represents.
weight_sequences: array of np.float32 of shape
    (num_sequences x max_sequence_length), optional.
    Per-position sample weights aligned with the item sequences.
num_items: int, optional
The number of distinct items in the data

Expand All @@ -287,11 +310,11 @@ class SequenceInteractions(object):
:func:`~Interactions.to_sequence`
"""

def __init__(self,
sequences,
user_ids=None, num_items=None):
def __init__(self, sequences,
user_ids=None, weight_sequences=None, num_items=None):

self.sequences = sequences
self.weight_sequences = weight_sequences
self.user_ids = user_ids
self.max_sequence_length = sequences.shape[1]

Expand Down
Loading