From 4a7cd5825717b0b0c96a6463e0dd2d7b18dd4331 Mon Sep 17 00:00:00 2001
From: alexeib
Date: Fri, 30 Aug 2019 16:23:40 -0700
Subject: [PATCH] set numpy seed explicitly + other minor fixes (#850)

Summary:
Not setting the numpy seed explicitly at the beginning was an extremely annoying bug to find: it caused different GPUs to have a different view of the data whenever any randomization was used in the dataset (e.g. SubsampleDataset).

Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/850

Differential Revision: D17085006

Pulled By: alexeib

fbshipit-source-id: 62bb2116369fb703df878e6bc24c06f1ea4e75a0
---
 fairseq/data/replace_dataset.py   | 24 +++++++++++++++++-------
 fairseq/data/subsample_dataset.py | 13 +++++++++++--
 train.py                          |  2 ++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/fairseq/data/replace_dataset.py b/fairseq/data/replace_dataset.py
index 670b812f45..3bc52f0fb5 100644
--- a/fairseq/data/replace_dataset.py
+++ b/fairseq/data/replace_dataset.py
@@ -7,20 +7,30 @@
 
 
 class ReplaceDataset(BaseWrapperDataset):
-    def __init__(self, dataset, replace_map, offset=0):
+    """Replaces tokens found in the dataset with a specified replacement token
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to replace tokens in
+        replace_map (Dict[int, int]): map of token to replace -> replacement token
+        offsets (List[int]): do not replace tokens before this offset (from the left if positive,
+            from the right if negative); one per object returned by the dataset's __getitem__ method.
+    """
+
+    def __init__(self, dataset, replace_map, offsets):
         super().__init__(dataset)
         assert len(replace_map) > 0
         self.replace_map = replace_map
-        self.offset = offset
+        self.offsets = offsets
 
     def __getitem__(self, index):
         item = self.dataset[index]
         is_tuple = isinstance(item, tuple)
-        src = item[0] if is_tuple else item
+        srcs = item if is_tuple else [item]
 
-        for k, v in self.replace_map.items():
-            src_off = src[self.offset:]
-            src_off.masked_fill_(src_off == k, v)
+        for offset, src in zip(self.offsets, srcs):
+            for k, v in self.replace_map.items():
+                src_off = src[offset:] if offset >= 0 else src[:offset]
+                src_off.masked_fill_(src_off == k, v)
 
-        item = tuple((src,) + item[1:]) if is_tuple else src
+        item = srcs if is_tuple else srcs[0]
         return item

diff --git a/fairseq/data/subsample_dataset.py b/fairseq/data/subsample_dataset.py
index 983a611393..f1c2942e52 100644
--- a/fairseq/data/subsample_dataset.py
+++ b/fairseq/data/subsample_dataset.py
@@ -9,15 +9,24 @@
 
 
 class SubsampleDataset(BaseWrapperDataset):
+    """Subsamples a given dataset by a specified ratio. Subsampling is done on the number of examples.
+
+    Args:
+        dataset (~torch.utils.data.Dataset): dataset to subsample
+        size_ratio (float): the ratio to subsample to; must be between 0 and 1 (exclusive)
+    """
+
     def __init__(self, dataset, size_ratio):
         super().__init__(dataset)
         assert size_ratio < 1
         self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
         self.indices = np.random.choice(
-            range(len(self.dataset)), self.actual_size, replace=False
+            list(range(len(self.dataset))), self.actual_size, replace=False
         )
         print(
-            "subsampled dataset from {} to {} (ratio={})".format(len(self.dataset), self.actual_size, size_ratio)
+            "subsampled dataset from {} to {} (ratio={})".format(
+                len(self.dataset), self.actual_size, size_ratio
+            )
         )
 
     def __getitem__(self, index):

diff --git a/train.py b/train.py
index afe9c10232..e4f0f7a5d2 100644
--- a/train.py
+++ b/train.py
@@ -9,6 +9,7 @@
 
 import collections
 import math
+import numpy as np
 import random
 
 import torch
@@ -28,6 +29,7 @@ def main(args, init_distributed=False):
     # Initialize CUDA and distributed training
     if torch.cuda.is_available() and not args.cpu:
         torch.cuda.set_device(args.device_id)
+    np.random.seed(args.seed)
     torch.manual_seed(args.seed)
     if init_distributed:
         args.distributed_rank = distributed_utils.distributed_init(args)
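
A minimal sketch of the failure mode described in the Summary (illustrative only, not part of the patch; worker_subsample and its defaults are hypothetical stand-ins for what each distributed worker does when it builds a SubsampleDataset). Without a shared numpy seed, np.random.choice draws a different index subset in every worker process, so each GPU trains on a different view of the data; seeding numpy with the same args.seed in every worker, as the change to train.py does, makes the subsample identical across GPUs.

    import numpy as np

    def worker_subsample(dataset_len=10, size_ratio=0.5, seed=None):
        # Mimics SubsampleDataset.__init__: pick a random subset of example indices.
        if seed is not None:
            np.random.seed(seed)  # what the patched train.py now does once per worker
        actual_size = int(np.ceil(dataset_len * size_ratio))
        return np.random.choice(list(range(dataset_len)), actual_size, replace=False)

    # Unseeded: two "workers" almost always pick different subsets.
    print(sorted(worker_subsample()), sorted(worker_subsample()))

    # Seeded with the same args.seed: every worker picks the same subset.
    a = worker_subsample(seed=1)
    b = worker_subsample(seed=1)
    assert (np.sort(a) == np.sort(b)).all()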