I am fine-tuning a masked language model based on XLM-RoBERTa-large on the GCP machine spec listed below. I have extended the vocabulary by adding extra tokens.
I am using the pre-trained Hugging Face model.
I package the training script as train.py, copy it into a Docker image, and launch it on Vertex AI (GCP) using a ContainerSpec:
machineSpec = MachineSpec(machine_type="a2-highgpu-4g", accelerator_count=4, accelerator_type="NVIDIA_TESLA_A100")

The container command runs:

python -m torch.distributed.launch --nproc_per_node 4 train.py --gradient_accumulation_steps 16 --per_device_train_batch_size 4 --optim adamw_hf --tf32 --bf16
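For context, the job is submitted along the lines of the sketch below; the project, region, bucket, and image URI are placeholders, not the actual values used:

# Rough sketch of the Vertex AI submission (hypothetical project/region/bucket/image values).
from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1", staging_bucket="gs://my-staging-bucket")

worker_pool_specs = [{
    "machine_spec": {
        "machine_type": "a2-highgpu-4g",
        "accelerator_type": "NVIDIA_TESLA_A100",
        "accelerator_count": 4,
    },
    "replica_count": 1,
    "container_spec": {
        "image_uri": "gcr.io/my-project/mlm-train:latest",  # placeholder image
        "command": ["bash", "-c",
                    "python -m torch.distributed.launch --nproc_per_node 4 train.py "
                    "--gradient_accumulation_steps 16 --per_device_train_batch_size 4 "
                    "--optim adamw_hf --tf32 --bf16"],
    },
}]

job = aiplatform.CustomJob(display_name="xlm-roberta-mlm", worker_pool_specs=worker_pool_specs)
job.run()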
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
Traceback (most recent call last):
  File "train.py", line 215, in <module>
    trainer.train()
  File "/opt/conda/lib/python3.7/site-packages/transformers/trainer.py", line 1258, in train
    model = self._wrap_model(self.model_wrapped)
  File "/opt/conda/lib/python3.7/site-packages/transformers/trainer.py", line 1088, in _wrap_model
    **kwargs,
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 641, in __init__
    dist._verify_params_across_processes(self.process_group, parameters)
RuntimeError: params[0] in this process with sizes [253991, 1024] appears not to match sizes of the same param in process 0.
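The [253991, 1024] shape in the error is the resized word-embedding matrix, so DDP is complaining that at least one process ends up with a different vocabulary size than process 0. A minimal per-rank check (hypothetical, not part of the original script) would be to print the embedding shape on every process right before trainer.train():

# Hypothetical diagnostic: confirm every rank resized the vocabulary identically
# before the Trainer wraps the model in DistributedDataParallel.
import os

rank = os.environ.get("LOCAL_RANK", "0")
emb_shape = tuple(model.roberta.embeddings.word_embeddings.weight.shape)
print(f"rank {rank}: len(tokenizer)={len(tokenizer)}, word_embeddings={emb_shape}")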
torch==1.11.0+cu113
torchvision==0.12.0+cu113
torchaudio==0.11.0+cu113
transformers==4.17.0
Using GPU in script?: Yes
Using distributed or parallel set-up in script?: Yes
Who can help
Models:
- XLM-RoBERTa-large (xlm-roberta-large)
Library:
- Trainer
Information
Model I am using (Bert, XLNet …): XLM-RoBERTa-large (XLMRobertaForMaskedLM)
The problem arises when using:
- [x] my own modified scripts: (give details below)
The task I am working on is:
- [x] my own task or dataset: (give details below)
To reproduce
Steps to reproduce the behavior:
import torch
import transformers as tr

# joined_keywords is the list of new vocabulary words to add (defined elsewhere in the script)
tokenizer = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", local_files_only=True)
tokenizer.add_tokens(joined_keywords)

# keep an unmodified tokenizer around to get the original sub-token ids of each new word
tokenizer_org = tr.XLMRobertaTokenizer.from_pretrained("xlm-roberta-large", local_files_only=True)

model = tr.XLMRobertaForMaskedLM.from_pretrained("xlm-roberta-large", return_dict=True, local_files_only=True)
# add embedding params for new vocab words
model.resize_token_embeddings(len(tokenizer))
weights = model.roberta.embeddings.word_embeddings.weight

# initialize new embedding weights as mean of original tokens
with torch.no_grad():
    emb = []
    for word in joined_keywords:
        # first & last tokens are just string start/end; don't keep
        tok_ids = tokenizer_org(word)["input_ids"][1:-1]
        tok_weights = weights[tok_ids]
        # average over tokens in original tokenization
        weight_mean = torch.mean(tok_weights, dim=0)
        emb.append(weight_mean)
    weights[-len(joined_keywords):, :] = torch.vstack(emb).requires_grad_()
# tokenizer.convert_ids_to_tokens(encoded_input["input_ids"][0])
tokenizer_out_files = tokenizer.save_pretrained("tokenizer_xlm")
model.to(device)
# train_df here is the list of training texts (prepared elsewhere in the script)
train_encodings = tokenizer(train_df, truncation=True, padding=True, max_length=512, return_tensors="pt")
class SEDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings["attention_mask"])
train_data = SEDataset(train_encodings)
# print("train data created")
training_args = tr.TrainingArguments(
    output_dir='results_mlm_vocab_exp'
    ,logging_dir='logs_mlm_vocab_exp'  # directory for storing logs
    ,save_strategy="epoch"
    # ,run_name="MLM_Exp1"
    ,learning_rate=2e-5
    ,logging_steps=2000
    ,overwrite_output_dir=True
    ,num_train_epochs=20
    ,per_device_train_batch_size=4
    ,prediction_loss_only=True
    ,gradient_accumulation_steps=16
    # ,sharded_ddp='zero_dp_3'
    # ,gradient_checkpointing=True
    ,bf16=True  # Ampere GPU
    # ,fp16=True
    ,optim="adamw_hf"
    # ,dataloader_num_workers=20
    # ,logging_strategy='no'
    # per_device_train_batch_size
    # per_gpu_train_batch_size
    # disable_tqdm=True
)
# print("training sample is 200001")
# print("Included ,gradient_accumulation_steps=8 ,bf16=True and per_device_train_batch_size=16 " )
print("start time",start)
trainer = tr.Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
)
# print("training to start without bf16")
trainer.train()