Hi, I'm currently trying to reproduce the paper "Unsupervised Translation of Programming Languages" (TransCoder) from Facebook AI Research, but using T5 as the seq2seq model. Right now I am stuck on the back-translation part of the approach:
```python
import random
from typing import Dict, List

import torch
import torch_xla.core.xla_model as xm


def back_translate(self, batch: List[Dict[str, torch.Tensor]]) -> Dict[str, torch.Tensor]:
    device = batch[0]['input_ids'].device
    print(device)  # debugging: device the batch arrives on

    input_ids = torch.stack([example['input_ids'] for example in batch])
    target_ids = torch.stack([example['target_ids'] for example in batch])
    attention_mask = torch.stack([example['attention_mask'] for example in batch])

    # The first token of each target sequence is its language token, e.g. '<java>'
    batch_langs = [self.tokenizer.decode([ids[0].item()]) for ids in target_ids]
    lang = random.choice(self.langs)

    # Generate translations into a randomly chosen language (on CPU)
    self.model.eval()
    cpu_model = self.model.to('cpu')
    outputs = cpu_model.generate(
        input_ids=input_ids, attention_mask=attention_mask,
        decoder_start_token_id=self.tokenizer.encode(f'<{lang}>')[0],
        max_length=256
    )
    self.model.train()

    inputs = [self.tokenizer.decode(ids).replace('complete: ', '') for ids in input_ids]
    outputs = [self.tokenizer.decode(ids[1:]) for ids in outputs]  # drop the language token

    # Back-translation: the generated translation becomes the new model input,
    # and the original sequence (tagged with its original language) becomes the new target.
    examples = []
    for inpt, outpt, l in zip(outputs, inputs, batch_langs):
        inpt = f'complete: {inpt} </s>'
        outpt = f'<{l}>{outpt}'
        input_encodings = self.tokenizer.encode_plus(
            inpt, padding='max_length', max_length=256, truncation=True)
        target_encodings = self.tokenizer.encode_plus(
            outpt, padding='max_length', max_length=256, truncation=True)
        encodings = {
            'input_ids': torch.tensor(input_encodings['input_ids'], dtype=torch.long, device=xm.xla_device()),
            'attention_mask': torch.tensor(input_encodings['attention_mask'], dtype=torch.long, device=xm.xla_device()),
            'target_ids': torch.tensor(target_encodings['input_ids'], dtype=torch.long, device=xm.xla_device()),
            'target_attention_mask': torch.tensor(target_encodings['attention_mask'], dtype=torch.long, device=xm.xla_device()),
        }
        examples.append(encodings)

    input_ids = torch.stack([example['input_ids'] for example in examples])
    input_ids, _ = self.masked_data_collator.mask_tokens(input_ids)
    lm_labels = torch.stack([example['target_ids'] for example in examples])
    lm_labels[lm_labels == 0] = -100  # ignore pad tokens (id 0) in the loss
    attention_mask = torch.stack([example['attention_mask'] for example in examples])
    decoder_attention_mask = torch.stack([example['target_attention_mask'] for example in examples])
    print(input_ids.device, xm.xla_device())  # debugging: both report the XLA device

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'lm_labels': lm_labels,
        'decoder_attention_mask': decoder_attention_mask,
    }
```
I am training this in Google Colab on a TPU, but even though I am explicitly putting the tensors onto the TPU (XLA) device, I get an error saying: `Input tensor is not an XLA tensor: torch.FloatTensor`.
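One thing I'm unsure about (it may or may not be the cause): as far as I know, `nn.Module.to` moves the module in place and returns `self`, so the `cpu_model = self.model.to('cpu')` call used for generation would also leave `self.model` on the CPU afterwards. Here is a minimal sketch of that pattern — `nn.Linear` is just a stand-in for the T5 model — which I would expect to hit a similar device mismatch:

```python
import torch
import torch_xla.core.xla_model as xm

model = torch.nn.Linear(4, 4).to(xm.xla_device())  # module lives on the TPU

cpu_model = model.to('cpu')  # .to() moves the module in place and returns self,
                             # so `model` itself is now on the CPU too

x = torch.ones(1, 4, device=xm.xla_device())
y = model(x)  # CPU weights meet an XLA input -> device-mismatch error
```

If that is the problem, I would presumably need to move the model back with `self.model.to(xm.xla_device())` after generating, but I'd like to confirm I'm reading this right.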
Here is a link to the full Colab notebook: https://colab.research.google.com/drive/1nRGkCdei7D6v6njKWPVZZWtGgPAedkVQ?usp=sharing
Any help or advice would be greatly appreciated!