The script now works with a modification by Yasmin Moslem. I post it below in case others have a similar issue:
import datasets
from transformers import AutoTokenizer
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from random import randrange
# Read the parallel corpus from a local CSV file; load_dataset exposes it
# under the "train" split, which is the only split used below.
raw_dataset = load_dataset("csv", data_files="Luganda.csv")

# Target schema: a string "id" plus a "translation" record with "en"/"lg".
translation_features = datasets.Features(
    {
        "id": datasets.Value("string"),
        "translation": datasets.Translation(languages=["en", "lg"]),
    }
)

# Fold every row into {"id": <row index>, "translation": <row as dict>} and
# drop the original flat "en"/"lg" columns.
# NOTE(review): dict(row) copies *all* columns of the row - this presumably
# relies on the CSV containing only "en" and "lg"; confirm against the data.
luganda_dataset = raw_dataset["train"].map(
    lambda row, idx: {"id": idx, "translation": dict(row)},
    with_indices=True,
    remove_columns=["en", "lg"],
    features=translation_features,
)

# Hold out 20% of the rows as the "test" split (no seed is fixed here).
luganda_dataset = luganda_dataset.train_test_split(test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-lg")

# Language keys inside each "translation" record, and the text prepended to
# every source sentence by preprocess_function.
source_lang = "en"
target_lang = "lg"
prefix = "translate English to Luganda: "
def preprocess_function(examples):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        examples: A batch (as passed by ``Dataset.map(..., batched=True)``)
            whose ``"translation"`` entry is a list of dicts keyed by
            ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer output for the prefixed source sentences, with an extra
        ``"labels"`` entry holding the token ids of the target sentences.

    A pair is considered unusable when either side is ``None`` or has three
    or fewer characters after stripping. Such a pair is reported on stdout
    and replaced by a random number (used as both source and target) so the
    batch keeps the same number of rows as its input.
    """
    inputs = []
    targets = []
    for example in examples["translation"]:
        source = example[source_lang]
        target = example[target_lang]
        usable = (
            source is not None
            and target is not None
            and len(source.strip()) > 3
            and len(target.strip()) > 3
        )
        if usable:
            inputs.append(prefix + source.strip())
            targets.append(target.strip())
        else:
            # The header used to be a bare string expression, so it was
            # silently discarded; print it like the rest of the report.
            print("There is an issue with this segment:")
            print("Source:", source)
            print("Target:", target)
            random_num = randrange(10000)
            print("Replaced with", random_num)
            # Same placeholder on both sides keeps inputs/targets aligned.
            inputs.append(prefix + str(random_num))
            targets.append(str(random_num))

    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    # NOTE(review): as_target_tokenizer() is deprecated in newer transformers
    # releases in favour of tokenizer(..., text_target=...); kept as-is
    # because the installed version is not visible from this script.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Prints the repr of the bound ``map`` method (not the result of a call);
# presumably a leftover debugging aid.
print(luganda_dataset.map)

# Tokenize both splits, one batch of examples per preprocess_function call.
tokenized_luganda = luganda_dataset.map(preprocess_function, batched=True)

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-lg")
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate at the end of every epoch.
    evaluation_strategy="epoch",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Keep at most three checkpoints under output_dir.
    save_total_limit=3,
    # NOTE(review): mixed precision - presumably requires a suitable GPU.
    fp16=True,
)

train_split = tokenized_luganda["train"]
eval_split = tokenized_luganda["test"]

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()