Context
I am trying to fine-tune RobertaModel for the question-answering task. I built my custom QA model without using the RobertaForQuestionAnswering class.
I prepare the SQuAD datasets like this:
from transformers import PreTrainedTokenizer


def compute_train_features(examples, tokenizer: PreTrainedTokenizer, max_seq_len: int,
                           doc_stride: int, is_squad_v2: bool = False):
    # 1. Tokenize the examples
    # we want [BOS] question [SEP] context [EOS]
    pad_on_right = tokenizer.padding_side == "right"
    examples["question"] = [q.lstrip() for q in examples["question"]]
    # tokenized_examples <=> features of chunks
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # a long document may be split into several chunks; `sample_mapping` maps each chunk
    # back to the index of its original example
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # offset_mapping gives us a map from each token to its character span in the original context
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # 2. compute the start_positions and end_positions
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for chunk_idx, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][chunk_idx]
        # index of the beginning-of-sequence (BOS/CLS) token in this chunk,
        # used as the "no answer" label (for RoBERTa this is position 0)
        bos_index = input_ids.index(tokenizer.bos_token_id)
        # sequence ids (0 for question and 1 for context) if padding_side is right
        # sequence ids (1 for question and 0 for context) if padding_side is left
        sequence_ids = tokenized_examples.sequence_ids(chunk_idx)
        # sample_idx <=> which example this chunk comes from
        sample_idx = sample_mapping[chunk_idx]
        answers = examples["answers"][sample_idx]
        # when there is no answer provided (possible in SQuAD v2)
        if len(answers["answer_start"]) == 0:
            # we set the BOS/CLS index as the label for the start and end positions
            tokenized_examples["start_positions"].append(bos_index)
            tokenized_examples["end_positions"].append(bos_index)
        else:
            # TODO: currently only the first answer in the list is used; all answers should be handled
            # character-level positions
            ans_start_char = answers["answer_start"][0]
            ans_end_char = ans_start_char + len(answers["text"][0])
            # token-level positions
            token_start_index = 0
            token_end_index = len(input_ids) - 1
            # skip the question tokens and the special tokens [BOS], [SEP]
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1
            # skip the [PAD] and [EOS] tokens at the end
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1
            # now find the exact positions of the start and end tokens
            if offsets[token_start_index][0] <= ans_start_char and offsets[token_end_index][1] >= ans_end_char:
                # iterate over the tokens in the sequence to find the two ends of the answer
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= ans_start_char:
                    token_start_index += 1
                while offsets[token_end_index][1] >= ans_end_char:
                    token_end_index -= 1
                start_position = token_start_index - 1
                end_position = token_end_index + 1
                # edge case: the answer is the last word of the context
                if start_position == len(sequence_ids) - 1:
                    start_position = end_position
                tokenized_examples["start_positions"].append(start_position)
                tokenized_examples["end_positions"].append(end_position)
            else:
                # the answer is not in this chunk
                tokenized_examples["start_positions"].append(bos_index)
                tokenized_examples["end_positions"].append(bos_index)
    return tokenized_examples
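For completeness, this is roughly how I apply it to the raw dataset (a minimal sketch: the load_dataset("squad") call, the roberta-base checkpoint and the variable names are placeholders for my actual setup):

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("squad")  # or "squad_v2"
tokenizer = AutoTokenizer.from_pretrained("roberta-base", use_fast=True)

# each example can produce several chunks, so the original columns are dropped
train_features = raw_datasets["train"].map(
    lambda examples: compute_train_features(examples, tokenizer, max_seq_len=384, doc_stride=128),
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)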
def compute_evaluation_features(examples,
                                tokenizer: PreTrainedTokenizer,
                                max_seq_len: int = 384,
                                doc_stride: int = 128):
    pad_on_right = tokenizer.padding_side == "right"
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    tokenized_examples["example_id"] = []
    for i in range(len(tokenized_examples["input_ids"])):
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]
    return tokenized_examples
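And similarly for the validation split (again only a sketch; eval_examples keeps the untokenized examples because the post-processing step needs the original contexts and ids):

eval_examples = raw_datasets["validation"]
eval_features = eval_examples.map(
    lambda examples: compute_evaluation_features(examples, tokenizer, max_seq_len=384, doc_stride=128),
    batched=True,
    remove_columns=eval_examples.column_names,
)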
And I prepare the TrainingArguments and the Trainer like this:
training_args = TrainingArguments(
    output_dir=os.path.join(save_dir, "checkpoints"),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    do_predict=False,
    evaluation_strategy=IntervalStrategy.STEPS,
    per_device_train_batch_size=batch_size,
    save_strategy=IntervalStrategy.STEPS,
    save_total_limit=3,
    gradient_accumulation_steps=1,
    learning_rate=learning_rate,
    lr_scheduler_type=SchedulerType.LINEAR,
    eval_steps=eval_every,
    save_steps=eval_every,
    warmup_ratio=warmup_ratio,
    num_train_epochs=n_epochs,
    weight_decay=weight_decay,
    load_best_model_at_end=True,
    logging_dir=os.path.join(save_dir, "logs"),
    logging_strategy=IntervalStrategy.STEPS,
    logging_steps=params.eval_every,
    report_to=["tensorboard"],
    label_names=["start_positions", "end_positions"],
    metric_for_best_model="f1",
    log_on_each_node=False,
    seed=42,
    data_seed=42,
)

metric = load_metric("squad")

def compute_metrics(p: EvalPrediction):
    return metric.compute(predictions=p.predictions, references=p.label_ids)

trainer = train_handler.QuestionAnsweringTrainer(
    model=slgqa_model,
    args=training_args,
    train_dataset=train_features,
    eval_dataset=eval_features,
    eval_examples=eval_examples,
    data_collator=default_data_collator,
    tokenizer=tokenizer,
    post_process_function=eval_handler.post_process_function,  # taken from utils_qa.py
    compute_metrics=compute_metrics,
)
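As far as I understand, compute_metrics only works if post_process_function returns an EvalPrediction whose fields already follow the format the "squad" metric expects, i.e. something like this (hand-written example values, just to show the shapes):

predictions = [
    {"id": "56be4db0acb8001400a502ec", "prediction_text": "Denver Broncos"},
]
references = [
    {"id": "56be4db0acb8001400a502ec",
     "answers": {"text": ["Denver Broncos"], "answer_start": [177]}},
]
# metric.compute(predictions=predictions, references=references)
# -> {'exact_match': 100.0, 'f1': 100.0}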
And I always start the training process with distributed training (multi-process launch).
When it works
When I train the script on a single GPU, there is no problem during training, and I can nicely follow the results with TensorBoard while it runs. :)
Problems:
When I train the script on two GPUs, training runs fine until it hits eval_steps. It then computes the logits, post-processes them into predictions, etc., and all of that goes through without problems.
Right after that, it throws this error:
Traceback (most recent call last):
  File "./examples/train_slgqa.py", line 125, in <module>
    trainer.train()
  File "/home_expes/guk06997/guk06997/softLabel-guided-QA-model/env/lib/python3.8/site-packages/transformers/trainer.py", line 1501, in train
    return inner_training_loop(
  File "/home_expes/guk06997/guk06997/softLabel-guided-QA-model/env/lib/python3.8/site-packages/transformers/trainer.py", line 1826, in _inner_training_loop
    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
  File "/home_expes/guk06997/guk06997/softLabel-guided-QA-model/env/lib/python3.8/site-packages/transformers/trainer.py", line 2093, in _maybe_log_save_evaluate
    self._save_checkpoint(model, trial, metrics=metrics)
  File "/home_expes/guk06997/guk06997/softLabel-guided-QA-model/env/lib/python3.8/site-packages/transformers/trainer.py", line 2198, in _save_checkpoint
    metric_value = metrics[metric_to_check]
KeyError: 'eval_f1'
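If I read the failing line correctly, _save_checkpoint takes metric_for_best_model, prefixes it with eval_ when needed, and looks that key up in the metrics dict, roughly like this (my paraphrase of transformers/trainer.py around the failing line, not a verbatim copy):

metric_to_check = args.metric_for_best_model       # "f1" in my case
if not metric_to_check.startswith("eval_"):
    metric_to_check = f"eval_{metric_to_check}"     # -> "eval_f1"
metric_value = metrics[metric_to_check]             # KeyError: the metrics dict has no "eval_f1"

So the error seems to mean that, on that process, the metrics dict passed to _save_checkpoint does not contain "eval_f1".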
So I debugged a bit and found that the trainer process with LOCAL_RANK=1 still tries to save the checkpoint. The default save_on_each_node is False, so I thought this case should already be handled.
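To illustrate what I mean, a check along these lines (a rough sketch, not my exact debugging code) prints which metric keys each process ends up with:

import os
import torch.distributed as dist

metrics = trainer.evaluate()
rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0
print(f"LOCAL_RANK={os.environ.get('LOCAL_RANK', '0')}, rank={rank}: "
      f"metric keys = {sorted(metrics.keys())}")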
(Sorry for the places where I wasn't clear; feel free to ask for more information.)
Could you help me with this problem, please?