# torch_geometric/datasets/hm.py
#
# NOTE: `HM` is exported from `torch_geometric/datasets/__init__.py`
# (`from .hm import HM`, listed next to 'AmazonBook') and announced in
# `CHANGELOG.md` as "Added the `HM` personalized fashion recommendation
# dataset" (PR #7515).
from typing import Callable, List, Optional

import torch

from torch_geometric.data import HeteroData, InMemoryDataset


class HM(InMemoryDataset):
    r"""The heterogeneous H&M dataset from the `Kaggle H&M Personalized Fashion
    Recommendations
    <https://www.kaggle.com/competitions/
    h-and-m-personalized-fashion-recommendations>`_
    challenge.
    The task is to develop product recommendations based on data from previous
    transactions, as well as from customer and product meta data.

    The processed graph always contains :obj:`"customer"` and
    :obj:`"article"` node types with one-hot encoded features.
    By default, every transaction becomes a
    :obj:`("customer", "to", "article")` edge (plus its reverse
    :obj:`("article", "rev_to", "customer")` edge) that carries a
    :obj:`time` stamp in days and an :obj:`edge_attr` holding the one-hot
    encoded sales channel and the price.

    The raw files are not downloaded automatically; they need to be placed
    manually into the raw directory of the dataset.

    Args:
        root (str): Root directory where the dataset should be saved.
        use_all_tables_as_node_types (bool, optional): If set to :obj:`True`,
            will use the transaction table as a distinct node type.
            (default: :obj:`False`)
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            every access. (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.HeteroData` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """
    url = ('https://www.kaggle.com/competitions/'
           'h-and-m-personalized-fashion-recommendations/data')

    def __init__(
        self,
        root: str,
        use_all_tables_as_node_types: bool = False,
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
    ):
        # Needs to be set before `super().__init__()` since
        # `processed_file_names` depends on it.
        self.use_all_tables_as_node_types = use_all_tables_as_node_types
        super().__init__(root, transform, pre_transform)
        self.load(self.processed_paths[0], data_cls=HeteroData)

    @property
    def raw_file_names(self) -> List[str]:
        # Order matters: `process()` indexes `raw_paths` positionally.
        return [
            'customers.csv.zip', 'articles.csv.zip',
            'transactions_train.csv.zip'
        ]

    @property
    def processed_file_names(self) -> str:
        # Both graph layouts get their own file so that they can co-exist in
        # the same root directory.
        if self.use_all_tables_as_node_types:
            return 'data.pt'
        else:
            return 'data_merged.pt'

    def download(self) -> None:
        r"""Always raises, since the raw files need to be fetched manually.

        Raises:
            RuntimeError: Points the user to :obj:`url` and :obj:`raw_dir`.
        """
        raise RuntimeError(
            f"Dataset not found. Please download {self.raw_file_names} from "
            f"'{self.url}' and move it to '{self.raw_dir}'")

    @staticmethod
    def _one_hot(df, columns: List[str]) -> torch.Tensor:
        r"""One-hot encodes every column of :obj:`df` listed in
        :obj:`columns` and concatenates the results along the last dimension
        (in the given column order)."""
        import pandas as pd

        xs = [
            torch.from_numpy(pd.get_dummies(df[name]).values).to(torch.float)
            for name in columns
        ]
        return torch.cat(xs, dim=-1)

    def process(self) -> None:
        r"""Reads the three raw tables, builds the
        :class:`~torch_geometric.data.HeteroData` object, applies
        :obj:`pre_transform` (if given) and saves the result to
        :obj:`processed_paths[0]`."""
        import pandas as pd

        data = HeteroData()

        # Process customer data ###############################################
        df = pd.read_csv(self.raw_paths[0], index_col='customer_id')
        customer_map = {idx: i for i, idx in enumerate(df.index)}

        x = self._one_hot(df, [
            'Active',
            'FN',
            'club_member_status',
            'fashion_news_frequency',
        ])

        # Missing ages are replaced by the mean age before scaling by the
        # maximum age.
        age = torch.from_numpy(df['age'].values).to(torch.float).view(-1, 1)
        age = age.nan_to_num(nan=age.nanmean())

        data['customer'].x = torch.cat([x, age / age.max()], dim=-1)

        # Process article data ################################################
        df = pd.read_csv(self.raw_paths[1], index_col='article_id')
        article_map = {idx: i for i, idx in enumerate(df.index)}

        data['article'].x = self._one_hot(
            df,
            [  # We drop a few columns here that are high cardinality.
                # 'product_code',  # Drop.
                # 'prod_name',  # Drop.
                'product_type_no',
                'product_type_name',
                'product_group_name',
                'graphical_appearance_no',
                'graphical_appearance_name',
                'colour_group_code',
                'colour_group_name',
                'perceived_colour_value_id',
                'perceived_colour_value_name',
                'perceived_colour_master_id',
                'perceived_colour_master_name',
                # 'department_no',  # Drop.
                # 'department_name',  # Drop.
                'index_code',
                'index_name',
                'index_group_no',
                'index_group_name',
                'section_no',
                'section_name',
                'garment_group_no',
                'garment_group_name',
                # 'detail_desc',  # Drop.
            ])

        # Process transaction data ############################################
        df = pd.read_csv(self.raw_paths[2], parse_dates=['t_dat'])

        x1 = pd.get_dummies(df['sales_channel_id']).values
        x1 = torch.from_numpy(x1).to(torch.float)
        x2 = torch.from_numpy(df['price'].values).to(torch.float).view(-1, 1)
        x = torch.cat([x1, x2], dim=-1)

        # Request 64 bits explicitly: a plain `astype(int)` picks NumPy's
        # platform default integer, which is only 32 bits wide on some
        # platforms (e.g., Windows with NumPy < 2.0) and cannot hold
        # nanosecond timestamps.
        time = torch.from_numpy(df['t_dat'].values.astype('int64'))
        time = time // (60 * 60 * 24 * 10**9)  # Convert nanoseconds to days.

        # Map the raw string/integer IDs to consecutive node indices. Unknown
        # IDs raise a `KeyError`.
        src = torch.tensor([customer_map[idx] for idx in df['customer_id']])
        dst = torch.tensor([article_map[idx] for idx in df['article_id']])

        if self.use_all_tables_as_node_types:
            # Every transaction becomes its own node that is connected to its
            # customer and its article.
            data['transaction'].x = x
            data['transaction'].time = time

            transaction = torch.arange(len(df))

            edge_index = torch.stack([src, transaction], dim=0)
            data['customer', 'to', 'transaction'].edge_index = edge_index
            edge_index = edge_index.flip([0])
            data['transaction', 'rev_to', 'customer'].edge_index = edge_index

            edge_index = torch.stack([dst, transaction], dim=0)
            data['article', 'to', 'transaction'].edge_index = edge_index
            edge_index = edge_index.flip([0])
            data['transaction', 'rev_to', 'article'].edge_index = edge_index
        else:
            # Every transaction becomes a customer-article edge; the
            # transaction features live on the edge.
            edge_index = torch.stack([src, dst], dim=0)
            data['customer', 'to', 'article'].edge_index = edge_index
            data['customer', 'to', 'article'].time = time
            data['customer', 'to', 'article'].edge_attr = x

            edge_index = edge_index.flip([0])
            data['article', 'rev_to', 'customer'].edge_index = edge_index
            data['article', 'rev_to', 'customer'].time = time
            data['article', 'rev_to', 'customer'].edge_attr = x

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        self.save([data], self.processed_paths[0])
a/torch_geometric/datasets/movie_lens.py +++ b/torch_geometric/datasets/movie_lens.py @@ -33,12 +33,15 @@ class MovieLens(InMemoryDataset): features. The model comes from the`Huggingface SentenceTransformer `_. """ - url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip' - def __init__(self, root, transform: Optional[Callable] = None, - pre_transform: Optional[Callable] = None, - model_name: Optional[str] = "all-MiniLM-L6-v2"): + def __init__( + self, + root: str, + transform: Optional[Callable] = None, + pre_transform: Optional[Callable] = None, + model_name: Optional[str] = 'all-MiniLM-L6-v2', + ): self.model_name = model_name super().__init__(root, transform, pre_transform) self.data, self.slices = torch.load(self.processed_paths[0])