Hey everyone,
Actually, I am trying to accommodate multiple subtasks including STS-b and NER into a multi-task model, however am unable to issue the tokens from the conll dataset into the DataCollator. Can anyone help me with this? The code snippet is shown below.
class NLPDataCollator(DefaultDataCollator):
"""
Extending the existing DataCollator to work with NLP dataset batches
"""
def collate_batch(self, features: List[Union[InputDataClass, Dict]]) -> Dict[str, torch.Tensor]:
first = features[0]
if isinstance(first, dict):
# NLP data sets current works presents features as lists of dictionary
# (one per example), so we will adapt the collate_batch logic for that
if "labels" in first and first["labels"] is not None:
if first["labels"].dtype == torch.int64:
labels = torch.tensor([f["labels"] for f in features], dtype=torch.long)
else:
labels = torch.tensor([f["labels"] for f in features], dtype=torch.float)
batch = {"labels": labels}
for k, v in first.items():
if k != "labels" and v is not None and not isinstance(v, str):
batch[k] = torch.stack([f[k] for f in features])
return batch
else:
# otherwise, revert to using the default collate_batch
return DefaultDataCollator().collate_batch(features)