Hi there,
This my code to just keep training RoBerta on an additional english corpus:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()
from transformers import RobertaTokenizer, RobertaModel
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
#encoded_input = tokenizer(text, return_tensors='pt')
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
tokenizer=tokenizer,
file_path="./train/reports.txt",
block_size=128,
)
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir="./cyrobertapre",
overwrite_output_dir=True,
num_train_epochs=1,
per_device_train_batch_size=64,
save_steps=10_000,
save_total_limit=2,
prediction_loss_only=True,
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=dataset,
)
%%time
trainer.train()
I am getting a type error:
TypeError: RobertaModel.forward() got an unexpected keyword argument ‘labels’
I don’t know what it means.
Any help?