Skip to content

Commit

Permalink
Support tokenizers that do not have a pad_token configured
Browse files Browse the repository at this point in the history
- closes #37
  • Loading branch information
krasserm committed Feb 24, 2023
1 parent 1847909 commit f807b2e
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 3 deletions.
2 changes: 1 addition & 1 deletion perceiver/data/text/collator.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def _prepare_sequence(self, sequence, max_length):
sequence,
add_special_tokens=False,
return_token_type_ids=False,
- padding=PaddingStrategy.MAX_LENGTH,
+ padding=False if self.tokenizer.pad_token is None else PaddingStrategy.MAX_LENGTH,
max_length=max_length,
truncation=True,
)
Expand Down
3 changes: 2 additions & 1 deletion perceiver/data/text/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def preprocess(self, text):
def preprocess_batch(self, text_batch):
result = self.tokenizer(
text_batch,
- padding=True,
+ padding=self.tokenizer.pad_token is not None,
truncation=True,
add_special_tokens=self.add_special_tokens,
return_token_type_ids=False,
Expand Down Expand Up @@ -261,6 +261,7 @@ def _tokenize_dataset(
return_word_ids=True,
):
def tokenize(examples):
+ examples["text"] = [text + self.tokenizer.eos_token for text in examples["text"]]
encoding = self.tokenizer(
examples["text"],
padding=padding,
Expand Down
2 changes: 1 addition & 1 deletion perceiver/model/text/clm.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def create(cls, config: CausalLanguageModelConfig, **kwargs: Any):
def setup(self, stage: Optional[str] = None):
dm = self.trainer.datamodule

- if dm.tokenizer.padding_side != "left":
+ if dm.tokenizer.pad_token is not None and dm.tokenizer.padding_side != "left":
raise ValueError(
"Causal language modeling with Perceiver AR requires a data module configured with padding_side=left"
)
Expand Down

0 comments on commit f807b2e

Please sign in to comment.