pyg-team · rusty1s · Jun 5, 2023 · Jun 5, 2023 · Jun 5, 2023 · Jun 5, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added the `HM` personalized fashion recommendation dataset ([#7515](https://github.com/pyg-team/pytorch_geometric/pull/7515))
 - Added the `GraphMixer` model ([#7501](https://github.com/pyg-team/pytorch_geometric/pull/7501))
 - Added the `disable_dynamic_shape` experimental flag ([#7246](https://github.com/pyg-team/pytorch_geometric/pull/7246))
 - Added the option to override `use_segmm` selection in `HeteroLinear` ([#7474](https://github.com/pyg-team/pytorch_geometric/pull/7474))

@@ -85,6 +85,7 @@
 from .taobao import Taobao
 from .igmc_dataset import IGMCDataset
 from .amazon_book import AmazonBook
+from .hm import HM
 
 from .fake import FakeDataset, FakeHeteroDataset
 from .sbm_dataset import StochasticBlockModelDataset
@@ -188,6 +189,7 @@
     'Taobao',
     'IGMCDataset',
     'AmazonBook',
+    'HM',
 ]
 synthetic_datasets = [
     'FakeDataset',

@@ -0,0 +1,162 @@
+from typing import Callable, List, Optional
+
+import torch
+
+from torch_geometric.data import HeteroData, InMemoryDataset
+
+
+class HM(InMemoryDataset):
+    r"""The heterogeneous H&M dataset from the `Kaggle H&M Personalized Fashion
+    Recommendations
+    <https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations>`_
+    challenge.
+    The task is to develop product recommendations based on data from previous
+    transactions, as well as from customer and product meta data.
+    The raw files need to be downloaded manually from Kaggle.
+
+    Args:
+        root (str): Root directory where the dataset should be saved.
+        use_all_tables_as_node_types (bool, optional): If set to :obj:`True`,
+            will use the transaction table as a distinct node type.
+            Otherwise, transactions are stored as edges between customers
+            and articles. (default: :obj:`False`)
+        transform (callable, optional): A function/transform that takes in an
+            :obj:`torch_geometric.data.HeteroData` object and returns a
+            transformed version. The data object will be transformed before
+            every access. (default: :obj:`None`)
+        pre_transform (callable, optional): A function/transform that takes in
+            an :obj:`torch_geometric.data.HeteroData` object and returns a
+            transformed version. The data object will be transformed before
+            being saved to disk. (default: :obj:`None`)
+    """
+    url = ('https://www.kaggle.com/competitions/'
+           'h-and-m-personalized-fashion-recommendations/data')
+
+    def __init__(
+        self,
+        root: str,
+        use_all_tables_as_node_types: bool = False,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+    ) -> None:
+        # Set first, since `processed_file_names` depends on this flag:
+        self.use_all_tables_as_node_types = use_all_tables_as_node_types
+        super().__init__(root, transform, pre_transform)
+        self.load(self.processed_paths[0], data_cls=HeteroData)
+
+    @property
+    def raw_file_names(self) -> List[str]:
+        return [
+            'customers.csv.zip', 'articles.csv.zip',
+            'transactions_train.csv.zip'
+        ]
+
+    @property
+    def processed_file_names(self) -> str:
+        if self.use_all_tables_as_node_types:
+            return 'data.pt'
+        return 'data_merged.pt'
+
+    def download(self) -> None:
+        r"""Raises an error that asks for a manual download of the files."""
+        raise RuntimeError(
+            f"Dataset not found. Please download {self.raw_file_names} from "
+            f"'{self.url}' and move it to '{self.raw_dir}'")
+
+    def process(self) -> None:
+        r"""Builds the graph from the raw CSV files and saves it to disk."""
+        import pandas as pd
+
+        def one_hot(column) -> torch.Tensor:
+            # One-hot encodes a categorical column into a float matrix.
+            dummies = pd.get_dummies(column).values
+            return torch.from_numpy(dummies).to(torch.float)
+
+        data = HeteroData()
+
+        # Process customer data ###############################################
+        df = pd.read_csv(self.raw_paths[0], index_col='customer_id')
+        # Map raw customer IDs to consecutive node indices (in row order):
+        customer_map = {idx: i for i, idx in enumerate(df.index)}
+
+        xs = [
+            one_hot(df[name]) for name in
+            ['Active', 'FN', 'club_member_status', 'fashion_news_frequency']
+        ]
+
+        # Replace missing ages by the mean age and normalize by the maximum.
+        # `nan` expects a Python number, so unwrap the 0-dim mean tensor:
+        x = torch.from_numpy(df['age'].values).to(torch.float).view(-1, 1)
+        x = x.nan_to_num(nan=float(x.nanmean()))
+        xs.append(x / x.max())
+
+        data['customer'].x = torch.cat(xs, dim=-1)
+
+        # Process article data ################################################
+        df = pd.read_csv(self.raw_paths[1], index_col='article_id')
+        article_map = {idx: i for i, idx in enumerate(df.index)}
+
+        # We drop a few columns here that are high cardinality: `product_code`,
+        # `prod_name`, `department_no`, `department_name` and `detail_desc`.
+        xs = [
+            one_hot(df[name]) for name in [
+                'product_type_no', 'product_type_name', 'product_group_name',
+                'graphical_appearance_no', 'graphical_appearance_name',
+                'colour_group_code', 'colour_group_name',
+                'perceived_colour_value_id', 'perceived_colour_value_name',
+                'perceived_colour_master_id', 'perceived_colour_master_name',
+                'index_code', 'index_name',
+                'index_group_no', 'index_group_name',
+                'section_no', 'section_name',
+                'garment_group_no', 'garment_group_name',
+            ]
+        ]
+
+        data['article'].x = torch.cat(xs, dim=-1)
+
+        # Process transaction data ############################################
+        df = pd.read_csv(self.raw_paths[2], parse_dates=['t_dat'])
+
+        x1 = one_hot(df['sales_channel_id'])
+        x2 = torch.from_numpy(df['price'].values).to(torch.float).view(-1, 1)
+        x = torch.cat([x1, x2], dim=-1)
+
+        # Convert to days since the Unix epoch, independent of the timestamp
+        # resolution and of the platform-dependent width of `int`:
+        days = df['t_dat'].values.astype('datetime64[D]').astype('int64')
+        time = torch.from_numpy(days)
+
+        # Translate raw IDs of each transaction into node indices:
+        src = torch.tensor([customer_map[idx] for idx in df['customer_id']])
+        dst = torch.tensor([article_map[idx] for idx in df['article_id']])
+
+        if self.use_all_tables_as_node_types:
+            # Every transaction becomes a node linked to customer and article:
+            data['transaction'].x = x
+            data['transaction'].time = time
+
+            edge_index = torch.stack([src, torch.arange(len(df))], dim=0)
+            data['customer', 'to', 'transaction'].edge_index = edge_index
+            edge_index = edge_index.flip([0])
+            data['transaction', 'rev_to', 'customer'].edge_index = edge_index
+
+            edge_index = torch.stack([dst, torch.arange(len(df))], dim=0)
+            data['article', 'to', 'transaction'].edge_index = edge_index
+            edge_index = edge_index.flip([0])
+            data['transaction', 'rev_to', 'article'].edge_index = edge_index
+        else:
+            # Every transaction becomes a customer-article edge (both ways):
+            edge_index = torch.stack([src, dst], dim=0)
+            data['customer', 'to', 'article'].edge_index = edge_index
+            data['customer', 'to', 'article'].time = time
+            data['customer', 'to', 'article'].edge_attr = x
+
+            edge_index = edge_index.flip([0])
+            data['article', 'rev_to', 'customer'].edge_index = edge_index
+            data['article', 'rev_to', 'customer'].time = time
+            data['article', 'rev_to', 'customer'].edge_attr = x
+
+        if self.pre_transform is not None:
+            data = self.pre_transform(data)
+
+        self.save([data], self.processed_paths[0])
@@ -33,12 +33,15 @@ class MovieLens(InMemoryDataset):
             features. The model comes from the`Huggingface SentenceTransformer
             <https://huggingface.co/sentence-transformers>`_.
     """
-
     url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
 
-    def __init__(self, root, transform: Optional[Callable] = None,
-                 pre_transform: Optional[Callable] = None,
-                 model_name: Optional[str] = "all-MiniLM-L6-v2"):
+    def __init__(
+        self,
+        root: str,
+        transform: Optional[Callable] = None,
+        pre_transform: Optional[Callable] = None,
+        model_name: Optional[str] = 'all-MiniLM-L6-v2',
+    ):
         self.model_name = model_name
         super().__init__(root, transform, pre_transform)
         self.data, self.slices = torch.load(self.processed_paths[0])