diff --git a/fairseq/data/transforms/__init__.py b/fairseq/data/encoders/__init__.py similarity index 83% rename from fairseq/data/transforms/__init__.py rename to fairseq/data/encoders/__init__.py index a64954edd6..1e7e69fbea 100644 --- a/fairseq/data/transforms/__init__.py +++ b/fairseq/data/encoders/__init__.py @@ -24,8 +24,8 @@ ) -# automatically import any Python files in the transforms/ directory +# automatically import any Python files in the encoders/ directory for file in os.listdir(os.path.dirname(__file__)): if file.endswith('.py') and not file.startswith('_'): module = file[:file.find('.py')] - importlib.import_module('fairseq.data.transforms.' + module) + importlib.import_module('fairseq.data.encoders.' + module) diff --git a/fairseq/data/transforms/gpt2_bpe.py b/fairseq/data/encoders/gpt2_bpe.py similarity index 99% rename from fairseq/data/transforms/gpt2_bpe.py rename to fairseq/data/encoders/gpt2_bpe.py index 38361a49ac..b411a5aa39 100644 --- a/fairseq/data/transforms/gpt2_bpe.py +++ b/fairseq/data/encoders/gpt2_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('gpt2') diff --git a/fairseq/data/transforms/moses_tokenizer.py b/fairseq/data/encoders/moses_tokenizer.py similarity index 96% rename from fairseq/data/transforms/moses_tokenizer.py rename to fairseq/data/encoders/moses_tokenizer.py index dc6016d914..4964a822c2 100644 --- a/fairseq/data/transforms/moses_tokenizer.py +++ b/fairseq/data/encoders/moses_tokenizer.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('moses') diff --git a/fairseq/data/transforms/nltk_tokenizer.py b/fairseq/data/encoders/nltk_tokenizer.py similarity index 93% rename from fairseq/data/transforms/nltk_tokenizer.py rename to fairseq/data/encoders/nltk_tokenizer.py index 206243c7fb..61325efc42 100644 --- a/fairseq/data/transforms/nltk_tokenizer.py +++ b/fairseq/data/encoders/nltk_tokenizer.py @@ -5,7 +5,7 @@ # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. -from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('nltk') diff --git a/fairseq/data/transforms/sentencepiece_bpe.py b/fairseq/data/encoders/sentencepiece_bpe.py similarity index 95% rename from fairseq/data/transforms/sentencepiece_bpe.py rename to fairseq/data/encoders/sentencepiece_bpe.py index 11b8dfe20c..9b27460194 100644 --- a/fairseq/data/transforms/sentencepiece_bpe.py +++ b/fairseq/data/encoders/sentencepiece_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('sentencepiece') diff --git a/fairseq/data/transforms/space_tokenizer.py b/fairseq/data/encoders/space_tokenizer.py similarity index 91% rename from fairseq/data/transforms/space_tokenizer.py rename to fairseq/data/encoders/space_tokenizer.py index 95d68a45d6..b804b969d8 100644 --- a/fairseq/data/transforms/space_tokenizer.py +++ b/fairseq/data/encoders/space_tokenizer.py @@ -7,7 +7,7 @@ import re -from fairseq.data.transforms import register_tokenizer +from fairseq.data.encoders import register_tokenizer @register_tokenizer('space') diff --git a/fairseq/data/transforms/subword_nmt_bpe.py b/fairseq/data/encoders/subword_nmt_bpe.py similarity index 97% rename from fairseq/data/transforms/subword_nmt_bpe.py rename to fairseq/data/encoders/subword_nmt_bpe.py index c5f2722340..b2c1fa33b9 100644 --- a/fairseq/data/transforms/subword_nmt_bpe.py +++ b/fairseq/data/encoders/subword_nmt_bpe.py @@ -6,7 +6,7 @@ # can be found in the PATENTS file in the same directory. from fairseq import file_utils -from fairseq.data.transforms import register_bpe +from fairseq.data.encoders import register_bpe @register_bpe('subword_nmt') diff --git a/fairseq/hub_utils.py b/fairseq/hub_utils.py index 1218b9da7c..02c3291fda 100644 --- a/fairseq/hub_utils.py +++ b/fairseq/hub_utils.py @@ -9,7 +9,7 @@ import torch from fairseq import utils -from fairseq.data import transforms +from fairseq.data import encoders class Generator(object): @@ -44,8 +44,8 @@ def __init__(self, args, task, models): # (None if no unknown word replacement, empty if no path to align dictionary) self.align_dict = utils.load_align_dict(getattr(args, 'replace_unk', None)) - self.tokenizer = transforms.build_tokenizer(args) - self.bpe = transforms.build_bpe(args) + self.tokenizer = encoders.build_tokenizer(args) + self.bpe = encoders.build_bpe(args) def generate(self, src_str, verbose=False): diff --git a/interactive.py b/interactive.py index eea7fcbda4..632a16f3ed 100644 --- a/interactive.py +++ b/interactive.py @@ -15,7 +15,7 @@ import torch from fairseq import checkpoint_utils, options, tasks, utils -from fairseq.data import transforms +from fairseq.data import encoders Batch = namedtuple('Batch', 'ids src_tokens src_lengths') @@ -103,8 +103,8 @@ def main(args): generator = task.build_generator(args) # Handle tokenization and BPE - tokenizer = transforms.build_tokenizer(args) - bpe = transforms.build_bpe(args) + tokenizer = encoders.build_tokenizer(args) + bpe = encoders.build_bpe(args) def encode_fn(x): if tokenizer is not None: