The code uses BERT, so I tried to convert the BERT embeddings back into text, but the conversion did not work. How should I do this?
[Full Code]
import torch
from torch.utils.data import DataLoader
from retro_pytorch import RETRO, RETRODataset
# mock data constants
import numpy as np
NUM_CHUNKS = 1000
CHUNK_SIZE = 64
NUM_SEQS = 100
NUM_NEIGHBORS = 2
def save_memmap(path, tensor):
    f = np.memmap(path, dtype=tensor.dtype, mode='w+', shape=tensor.shape)
    f[:] = tensor
    del f
# generate mock chunk data
save_memmap(
    './train.chunks.dat',
    np.int32(np.random.randint(0, 8192, size=(NUM_CHUNKS, CHUNK_SIZE + 1)))
)
# generate nearest neighbors for each chunk
save_memmap(
    './train.chunks.knn.dat',
    np.int32(np.random.randint(0, 1000, size=(NUM_CHUNKS, NUM_NEIGHBORS)))
)
# generate seq data
save_memmap(
    './train.seq.dat',
    np.int32(np.random.randint(0, 128, size=(NUM_SEQS,)))
)
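To verify that the mock data was written as expected, the memmaps can be reopened read-only with the same dtype and shape used when writing them (a minimal sanity check, not part of the original snippet):

# reopen the chunk memmap read-only; dtype and shape must match what was written above
chunks = np.memmap('./train.chunks.dat', dtype=np.int32, mode='r', shape=(NUM_CHUNKS, CHUNK_SIZE + 1))
print(chunks.shape, chunks.dtype)  # (1000, 65) int32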
# instantiate dataset class
# which constructs the sequence and neighbors from memmapped chunk and neighbor information
train_ds = RETRODataset(
    num_sequences=NUM_SEQS,
    num_chunks=NUM_CHUNKS,
    num_neighbors=NUM_NEIGHBORS,
    chunk_size=CHUNK_SIZE,
    seq_len=2048,
    chunk_memmap_path='./train.chunks.dat',
    chunk_nn_memmap_path='./train.chunks.knn.dat',
    seq_memmap_path='./train.seq.dat'
)
# Use a smaller batch size to avoid out-of-memory issues
batch_size = 1 # or any smaller value
# Create a DataLoader with the specified batch size
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
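As a quick check that the loader yields what RETRO expects, one batch can be pulled and its shapes printed (a minimal sketch; the exact shapes depend on the retro-pytorch version, so they are printed rather than asserted):

# each sample is a (sequence, retrieved neighbors) pair, as the training loop below unpacks
seq_sample, retrieved_sample = next(iter(train_dl))
print(seq_sample.shape, retrieved_sample.shape)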
# Instantiate RETRO model
retro = RETRO(
    max_seq_len=2048,
    enc_dim=896,
    enc_depth=3,
    dec_dim=768,
    dec_depth=12,
    dec_cross_attn_layers=(1, 3, 6, 9),
    heads=8,
    dim_head=64,
    dec_attn_dropout=0.25,
    dec_ff_dropout=0.25
).cuda()
for i, batch in enumerate(train_dl):
    # Move data to GPU
    seq, retrieved = map(lambda t: t.cuda(), batch)
    # Forward pass
    loss = retro(seq, retrieved, return_loss=True)
    # Backward pass
    loss.backward()
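The loop above computes the loss and backpropagates but never updates the weights. A minimal training sketch with an optimizer (the Adam optimizer and learning rate here are assumptions, not part of the original code) could look like:

from torch.optim import Adam

optim = Adam(retro.parameters(), lr=3e-4)  # hypothetical optimizer and learning rate

for seq, retrieved in train_dl:
    seq, retrieved = seq.cuda(), retrieved.cuda()
    loss = retro(seq, retrieved, return_loss=True)
    loss.backward()
    optim.step()       # apply the gradient update
    optim.zero_grad()  # clear gradients before the next batch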
from retro_pytorch.retrieval import bert_embed, tokenize
# Tokenize input text
input_texts = ['hello world', 'foo bar']
input_ids = tokenize(input_texts).cuda()
# Compute BERT embeddings on the GPU
embeds = bert_embed(input_ids, return_cls_repr=True) # (2, 768)
# Print or use the generated sequence (convert back to text if necessary)
print("Bert Embeddings:", embeds)
from transformers import BertTokenizer
# Load the BERT tokenizer
tokenizer2 = BertTokenizer.from_pretrained('LilaBoualili/bert-vanilla')
# Example BERT embeddings
bert_embeddings = embeds
# Decoding Text
print(tokenizer2.decode(bert_embeddings[0], skip_special_tokens=True))
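Note that bert_embed returns dense float vectors (one 768-dimensional embedding per input, as the (2, 768) comment above indicates), not token ids, so tokenizer.decode cannot turn them back into text; decode only maps integer token ids to strings. A minimal illustration (assuming the vocabulary of tokenizer2 matches the tokenizer used internally by retro_pytorch's tokenize; otherwise the decoded text will not match the input):

# token ids can be decoded back to text
print(tokenizer2.decode(input_ids[0].tolist(), skip_special_tokens=True))
# embeddings are continuous vectors with no direct inverse mapping to tokens
print(embeds.shape)  # torch.Size([2, 768])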