Thank you for taking the trouble to answer my query :-). I have incorporated your suggestion into my script, yet I am still getting KeyError('translation'). The full script, followed by the traceback, is below:
import datasets
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
luganda_dataset = load_dataset("csv", data_files="Luganda.csv")
luganda_dataset.map(
    lambda ex, i: {"id": i, "translation": dict(ex)},
    remove_columns=["en", "lg"],
    features=datasets.Features(
        {"id": datasets.Value("string"),
         "translation": datasets.Translation(languages=["en", "lg"])}
    ),
    with_indices=True,
)
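# Sanity check (my own addition while debugging, not part of the original
# suggestion): print which columns actually exist at this point; I expect
# ["id", "translation"] if the mapping worked as intended.
print(luganda_dataset["train"].column_names)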
luganda_dataset = luganda_dataset["train"].train_test_split(test_size=0.2)
tokenizer = AutoTokenizer.from_pretrained("./opus-mt-en-lg")
source_lang = "en"
target_lang = "lg"
prefix = "translate English to Luganda: "
def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
tokenized_luganda = luganda_dataset.map(preprocess_function, batched=True)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    fp16=True,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_luganda["train"],
    eval_dataset=tokenized_luganda["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
Traceback (most recent call last):
  File "finetune_luganda.py", line 28, in <module>
    tokenized_luganda = luganda_dataset.map(preprocess_function, batched=True)
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/dataset_dict.py", line 512, in map
    for k, dataset in self.items()
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/dataset_dict.py", line 512, in <dictcomp>
    for k, dataset in self.items()
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 2120, in map
    desc=desc,
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 518, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 485, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/fingerprint.py", line 413, in wrapper
    out = func(self, *args, **kwargs)
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 2485, in _map_single
    offset=offset,
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 2367, in apply_function_on_filtered_inputs
    processed_inputs = function(*fn_args, *additional_args, **fn_kwargs)
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 2062, in decorated
    result = f(decorated_item, *args, **kwargs)
  File "finetune_luganda.py", line 20, in preprocess_function
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
  File "/home/tel34/venv/lib/python3.6/site-packages/datasets/arrow_dataset.py", line 123, in __getitem__
    values = super().__getitem__(key)
  File "/usr/lib/python3.6/collections/__init__.py", line 991, in __getitem__
    raise KeyError(key)
KeyError: 'translation'
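One thing I am starting to wonder, though I have not verified it: does map() return a new DatasetDict rather than modifying luganda_dataset in place? If so, the result with the translation column is being discarded, because I never assign it to anything. A rough, untested sketch of what I would try next:

# Untested guess: assign the result of map() back, in case map()
# returns a new DatasetDict instead of modifying the original.
luganda_dataset = luganda_dataset.map(
    lambda ex, i: {"id": i, "translation": dict(ex)},
    remove_columns=["en", "lg"],
    features=datasets.Features(
        {"id": datasets.Value("string"),
         "translation": datasets.Translation(languages=["en", "lg"])}
    ),
    with_indices=True,
)

If that is not the issue, any other pointers would be much appreciated.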