I have pretrained two tokenizers: one with a vocabulary size of 15,000 and the other with 30,000. I used the same corpus and the same code for both, except for the vocab_size parameter.
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()  # instantiate before training
tokenizer.train(files=["samecorpus.txt"], vocab_size=..., min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
!mkdir folder
tokenizer.save_model("folder")
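As a sanity check on what was actually learned, the saved files can be reloaded and the real vocabulary size printed (a small sketch; get_vocab_size() also counts the added special tokens):

from tokenizers import ByteLevelBPETokenizer

# Reload the files written by save_model and report the trained vocab size.
loaded = ByteLevelBPETokenizer("folder/vocab.json", "folder/merges.txt")
print(loaded.get_vocab_size())  # expect roughly 15000 or 30000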
I intend to pre-train a RoBERTa model from scratch using the code from Hugging Face's tutorial, with the following modifications.
First, the configuration reflects the tokenizer's vocabulary size.
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=...,  # set to match the tokenizer (15000 or 30000)
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
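To confirm the two numbers line up, the trained tokenizer can be loaded the same way the tutorial does and compared against the config (a sketch, assuming "folder" holds the vocab.json and merges.txt written by save_model):

from transformers import RobertaTokenizerFast

# Load the byte-level BPE files saved earlier and compare sizes.
hf_tokenizer = RobertaTokenizerFast.from_pretrained("folder", max_len=512)
print(len(hf_tokenizer), config.vocab_size)  # these two should agree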
Second, I used a custom LineByLineTextDataset. It is essentially the same as Hugging Face's implementation, except that BatchedFile reads the file lazily because of the corpus size.
import torch
from torch.utils.data import Dataset


class BatchedFile():
    def __init__(self, tokenizer):
        self.file = open("largecorpus.txt", encoding="utf-8")
        self.COUNT = 51476181  # number of usable lines in largecorpus.txt
        self.tokenizer = tokenizer
        self.result = []

    def get(self):
        # Refill the buffer when it is empty, then hand out one encoding.
        if len(self.result) == 0:
            self.spawn()
        return self.result.pop(0)

    def spawn(self):
        # Read up to 10,000 non-empty lines and tokenize them as one batch.
        sentences = []
        while len(sentences) < 10_000 and self.COUNT > 0:
            teks = self.file.readline().strip()
            if len(teks) > 0 and not teks.isspace():
                sentences.append(teks)
                self.COUNT -= 1
            if self.COUNT == 0:
                # End of corpus reached: rewind so the next pass starts over.
                self.COUNT = 51476181
                self.file.close()
                self.file = open("largecorpus.txt", encoding="utf-8")
        batch_encoding = self.tokenizer(sentences, add_special_tokens=True, truncation=True, max_length=128)
        self.result = batch_encoding["input_ids"]
class LineByLineTextDataset(Dataset):
    """
    This will be superseded by a framework-agnostic approach
    soon.
    """

    def __init__(self, tokenizer):
        self.items = BatchedFile(tokenizer)

    def __len__(self):
        return 51476181

    def __getitem__(self, i) -> torch.Tensor:
        return torch.tensor(self.items.get(), dtype=torch.long)
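For reference, a sanity check along these lines (a sketch that reuses the hf_tokenizer and config names from the snippets above) would show whether the dataset ever emits an id outside the embedding table:

# Sketch: the largest token id the dataset emits must stay below
# config.vocab_size, or the model's embedding lookup goes out of range.
dataset = LineByLineTextDataset(hf_tokenizer)
sample = dataset[0]
print(sample.max().item(), "<", config.vocab_size)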
I successfully ran this code with the 30,000-word tokenizer. However, it does not work with the 15,000-word tokenizer, no matter what vocab_size I put in the configuration.
Attempting to run the code on the CPU resulted in this:
/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
1722 # remove once script supports set_grad_enabled
1723 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1724 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
1725
1726
IndexError: index out of range in self
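For comparison, a minimal stand-alone snippet (with hypothetical sizes) produces the same message when an embedding receives an id outside its table:

import torch
import torch.nn as nn

emb = nn.Embedding(15000, 768)   # valid ids are 0 .. 14999
emb(torch.tensor([15000]))       # IndexError: index out of range in self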
Attempting to run the code on the GPU resulted in this:
/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py in linear(input, weight, bias)
1610 ret = torch.addmm(bias, input, weight.t())
1611 else:
-> 1612 output = input.matmul(weight.t())
1613 if bias is not None:
1614 output += bias
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`
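Since CUDA errors are reported asynchronously, the GPU traceback may not point at the operation that actually failed; one way to get a more precise trace (an extra debugging step, not part of my training code) is to force synchronous launches before anything touches the GPU:

import os

# Must be set before CUDA is initialized so kernel launches are synchronous
# and the Python traceback lands on the failing operation.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"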
I’m wondering where I went wrong.