Skip to content

Commit

Permalink
Better tokenizing code for AuraFlow.
Browse files Browse the repository at this point in the history
  • Loading branch information
comfyanonymous committed Jul 12, 2024
1 parent b6f09cf commit 29c2e26
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 1,175 deletions.
6 changes: 3 additions & 3 deletions comfy/text_encoders/aura_t5.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from comfy import sd1_clip
from transformers import LlamaTokenizerFast
from .llama_tokenizer import LLAMATokenizer
import comfy.t5
import os

Expand All @@ -10,8 +10,8 @@ def __init__(self, device="cpu", layer="last", layer_idx=None, dtype=None):

# Tokenizer subclass of sd1_clip.SDTokenizer configured with embedding_key 'pile_t5xl'
# and embedding_size 2048; all other behavior comes from the base class (not shown here).
class PT5XlTokenizer(sd1_clip.SDTokenizer):
# NOTE(review): embedding_directory is accepted but is not passed on to super().__init__
# in either call below — confirm that is intentional.
def __init__(self, embedding_directory=None):
# NOTE(review): this hunk is a rendered diff with its +/- markers lost. The next two lines
# look like the removed version (tokenizer directory + LlamaTokenizerFast) — confirm.
tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=LlamaTokenizerFast, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)
# Presumably the added version: the path now points at the "tokenizer.model" file inside the
# "t5_pile_tokenizer" directory next to this module, and tokenizer_class is LLAMATokenizer.
tokenizer_path = os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "t5_pile_tokenizer"), "tokenizer.model")
super().__init__(tokenizer_path, pad_with_end=False, embedding_size=2048, embedding_key='pile_t5xl', tokenizer_class=LLAMATokenizer, has_start_token=False, pad_to_max_length=False, max_length=99999999, min_length=256, pad_token=1)

class AuraT5Tokenizer(sd1_clip.SD1Tokenizer):
def __init__(self, embedding_directory=None):
Expand Down
22 changes: 22 additions & 0 deletions comfy/text_encoders/llama_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os

class LLAMATokenizer:
    """Thin wrapper around a SentencePiece model file.

    Provides a ``from_pretrained`` constructor, ``get_vocab`` and a
    callable interface that returns a dict with an ``"input_ids"`` list.
    """

    @staticmethod
    def from_pretrained(path):
        """Create a tokenizer from the SentencePiece model file at ``path``."""
        return LLAMATokenizer(path)

    def __init__(self, tokenizer_path):
        """Load the SentencePiece model and remember its end-of-sequence id."""
        # Imported inside the constructor, so sentencepiece is only needed
        # once a tokenizer is actually instantiated.
        import sentencepiece

        self.tokenizer = sentencepiece.SentencePieceProcessor(model_file=tokenizer_path)
        self.end = self.tokenizer.eos_id()

    def get_vocab(self):
        """Return a dict mapping every piece string to its integer id."""
        processor = self.tokenizer
        piece_count = processor.get_piece_size()
        return {processor.id_to_piece(idx): idx for idx in range(piece_count)}

    def __call__(self, string):
        """Encode ``string`` and append the end-of-sequence id.

        Returns a dict of the form ``{"input_ids": [...]}``.
        """
        token_ids = self.tokenizer.encode(string)
        # Extend the encoder's own list in place, as ``+=`` would.
        token_ids.append(self.end)
        return {"input_ids": token_ids}
102 changes: 0 additions & 102 deletions comfy/text_encoders/t5_pile_tokenizer/added_tokens.json

This file was deleted.

125 changes: 0 additions & 125 deletions comfy/text_encoders/t5_pile_tokenizer/special_tokens_map.json

This file was deleted.

Loading

0 comments on commit 29c2e26

Please sign in to comment.