import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        self.examples = []
        self.sample_size = 10  # change it to something meaningful later
        self.src_files = dpy   # dpy and tokenizer are defined earlier in the script
        for i in range(self.sample_size):
            # Split each function string into lines and tokenize them as one batch
            sentences = dpy['train']['whole_func_string'][i].split('\n')
            self.examples += [t.ids for t in tokenizer.encode_batch(sentences)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We'll pad at the batch level.
        return torch.tensor(self.examples[i], dtype=torch.int64)
d = CustomDataset()
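As a quick sanity check (not needed for training, it just shows what the dataset returns), each item comes back as an unpadded 1-D tensor of token ids, so lengths differ from line to line and padding is deferred to the collator:

print(len(d))                   # number of tokenized lines gathered from the first 10 functions
print(d[0].shape, d[1].shape)   # usually different lengths; padding happens later, per batch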
Define the Data Collator
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=fast_tokenizer, mlm=True, mlm_probability=0.15, return_tensors='pt'
)
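To make the question concrete, here is a minimal sketch of what I understand the collator to do when it is handed a few items from d (assuming fast_tokenizer is the BERT-style fast tokenizer from earlier, with [PAD] and [MASK] tokens): it pads the examples to a common length, randomly selects about 15% of the token positions, replaces most of them with [MASK], and builds a labels tensor where every non-selected position is -100 so the MLM loss ignores it:

batch = data_collator([d[0], d[1], d[2]])
print(batch['input_ids'].shape)   # (3, longest_sequence_in_this_batch)
print(batch['input_ids'][0])      # some positions replaced with the [MASK] id
print(batch['labels'][0])         # -100 everywhere except the selected positions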
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./bert",
    overwrite_output_dir=True,
    num_train_epochs=100,
    do_train=True,
    per_device_train_batch_size=32,  # per_gpu_train_batch_size is deprecated
    save_steps=500,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=d,
    eval_dataset=d,
    data_collator=data_collator,
)
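For context, training is then started with the standard Trainer entry point (nothing specific to my setup):

trainer.train()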
Question: Here both train_dataset and eval_dataset are d. Does the data_collator mask the train_dataset and then train against the original, unmasked eval_dataset d, or do I need to provide a completely separate dataset for evaluation?
Also, what role does the data_collator play in training here? What exactly does it do?