I was attempting to implement Unsupervised Data Augmentation (UDA) with RoBERTa. Essentially, it is a semi-supervised learning technique in which each iteration processes a mini-batch of labeled data as well as a mini-batch of unlabeled data. The model's output on the unlabeled mini-batch is then compared to its output on an augmented version of the same data. The idea is to make the model invariant to insignificant differences in the input.
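For reference, my mental model of the unsupervised term in the UDA paper is roughly the sketch below (this is not my actual code; the function name, temperature value, and batch format are placeholders I'm using just to illustrate the consistency loss):

import torch
import torch.nn.functional as F

def uda_consistency_loss(model, unlabeled_batch, augmented_batch, temperature=0.4):
    # Target distribution: the model's prediction on the original (unaugmented) text,
    # sharpened with a temperature and detached so no gradient flows through it.
    with torch.no_grad():
        original_logits = model(**unlabeled_batch).logits
        target_probs = F.softmax(original_logits / temperature, dim=-1)

    # Prediction on the augmented version of the same text.
    augmented_log_probs = F.log_softmax(model(**augmented_batch).logits, dim=-1)

    # Consistency loss: KL divergence pulling the augmented prediction
    # toward the fixed original prediction.
    return F.kl_div(augmented_log_probs, target_probs, reduction="batchmean")

As you can see in my implementation below, I instead threshold the prediction on the original sentence into a hard "yes"/"no" pseudo-label and train on the augmented sentence with the ordinary cross-entropy loss.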
However, when I tried this approach, the evaluation F1 score was worse than when not using UDA at all (both paths are included in the train function below). In fact, I had much more success when I first trained on all of the labeled data for a certain number of epochs before transitioning to the unlabeled data (not included in the function below).
I was wondering if anyone had any insight into why my implementation of UDA was doing so poorly.
I made a custom HuggingFace Trainer subclass, and here is the relevant part of its train method:
def train(self, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None, **kwargs):
    # Initialize optimizer and scheduler
    num_training_steps = len(self.get_train_dataloader()) * self.args.num_train_epochs
    self.create_optimizer_and_scheduler(num_training_steps)

    model = self.model
    model = model.to('cuda')

    if not UDA:
        [... not relevant ...]
    else:
        for epoch in range(int(self.args.num_train_epochs)):
            loader_a = self.get_train_dataloader()
            augmented_df = self.dataset_b

            for (i, batch), (j, batch_df) in zip(enumerate(loader_a), enumerate(generate_random_batches(augmented_df, UNSUPERVISED_BATCH_SIZE))):
                ### Labeled part! ###
                batch = batch.to('cuda')
                outputs = model(**batch)
                loss = outputs.loss
                print(f"{epoch} Supervised loss: {loss}")
                loss.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.zero_grad()

                ### Unlabeled part! ###
                # Run the model on the original (unaugmented) sentences to get pseudo-labels
                temp_df_v1 = batch_df[["sentence", CATEGORY]]
                temp_df_v1 = convertDataFrametoDatasetObject(temp_df_v1)
                batch = {}
                batch["input_ids"] = temp_df_v1["input_ids"].to('cuda')
                batch["attention_mask"] = temp_df_v1["attention_mask"].to('cuda')
                batch["labels"] = temp_df_v1["labels"].to('cuda')
                outputs = model(**batch)
                logits = outputs.get('logits')

                # Threshold the predictions into "yes"/"no" pseudo-labels
                batch_df["Model Outputs"] = None
                logit_index = 0
                for index, row in batch_df.iterrows():
                    value = logits[logit_index].softmax(dim=-1).detach().cpu().flatten().numpy().tolist()
                    batch_df.loc[index, "Model Outputs"] = "yes" if value[1] > 0.5 else "no"
                    logit_index += 1

                # Finetune on the augmented sentences, using the pseudo-labels as targets
                batch_df["sentence"] = batch_df["augmented_sentence"]
                batch_df[CATEGORY] = batch_df["Model Outputs"]
                temp_df_v2 = batch_df[["sentence", CATEGORY]]
                temp_df_v2 = convertDataFrametoDatasetObject(temp_df_v2)
                batch = {}
                batch["input_ids"] = temp_df_v2["input_ids"].to('cuda')
                batch["attention_mask"] = temp_df_v2["attention_mask"].to('cuda')
                batch["labels"] = temp_df_v2["labels"].to('cuda')
                outputs = model(**batch)
                loss_unsupervised = outputs.loss
                print(f"{epoch} -- Unsupervised loss: {loss_unsupervised}")
                loss_unsupervised.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.zero_grad()

    print("Training is Done")