Text summarization model training problem - trainer.train() [00:00<?, ?it/s]

Hello

I am working through the Hugging Face NLP course, and after the first part I wanted to train a model for a summarization task on this dataset: WiktorS/polish-news.

Downloading and mapping work fine, but the training progress bar never starts advancing. I am working over SSH, with a GPU, in a conda env with the following packages (a quick GPU sanity check follows the list):

Name Version Build Channel
_libgcc_mutex 0.1 main
_openmp_mutex 5.1 1_gnu
absl-py 2.1.0 pypi_0 pypi
accelerate 0.30.1 pypi_0 pypi
astunparse 1.6.3 pypi_0 pypi
attrs 23.2.0 pypi_0 pypi
bzip2 1.0.8 h5eee18b_6
ca-certificates 2024.3.11 h06a4308_0
certifi 2024.6.2 pypi_0 pypi
click 8.1.7 pypi_0 pypi
expat 2.6.2 h6a678d5_0
flatbuffers 24.3.25 pypi_0 pypi
gast 0.5.4 pypi_0 pypi
google-pasta 0.2.0 pypi_0 pypi
grpcio 1.64.1 pypi_0 pypi
h5py 3.11.0 pypi_0 pypi
idna 3.7 pypi_0 pypi
jinja2 3.1.4 pypi_0 pypi
joblib 1.4.2 pypi_0 pypi
keras 3.3.3 pypi_0 pypi
ld_impl_linux-64 2.38 h1181459_1
libclang 18.1.1 pypi_0 pypi
libffi 3.4.4 h6a678d5_1
libgcc-ng 11.2.0 h1234567_1
libgomp 11.2.0 h1234567_1
libstdcxx-ng 11.2.0 h1234567_1
libuuid 1.41.5 h5eee18b_0
markdown 3.6 pypi_0 pypi
markdown-it-py 3.0.0 pypi_0 pypi
markupsafe 2.1.5 pypi_0 pypi
mdurl 0.1.2 pypi_0 pypi
ml-dtypes 0.3.2 pypi_0 pypi
mpmath 1.3.0 pypi_0 pypi
namex 0.0.8 pypi_0 pypi
ncurses 6.4 h6a678d5_0
networkx 3.3 pypi_0 pypi
nltk 3.8.1 pypi_0 pypi
nvidia-cublas-cu12 12.1.3.1 pypi_0 pypi
nvidia-cuda-cupti-cu12 12.1.105 pypi_0 pypi
nvidia-cuda-nvrtc-cu12 12.1.105 pypi_0 pypi
nvidia-cuda-runtime-cu12 12.1.105 pypi_0 pypi
nvidia-cudnn-cu12 8.9.2.26 pypi_0 pypi
nvidia-cufft-cu12 11.0.2.54 pypi_0 pypi
nvidia-curand-cu12 10.3.2.106 pypi_0 pypi
nvidia-cusolver-cu12 11.4.5.107 pypi_0 pypi
nvidia-cusparse-cu12 12.1.0.106 pypi_0 pypi
nvidia-nccl-cu12 2.20.5 pypi_0 pypi
nvidia-nvjitlink-cu12 12.5.40 pypi_0 pypi
nvidia-nvtx-cu12 12.1.105 pypi_0 pypi
openssl 3.0.13 h7f8727e_2
opt-einsum 3.3.0 pypi_0 pypi
optree 0.11.0 pypi_0 pypi
pip 24.0 py310h06a4308_0
protobuf 4.25.3 pypi_0 pypi
psutil 5.9.8 pypi_0 pypi
pygments 2.18.0 pypi_0 pypi
python 3.10.14 h955ad1f_1
pytz 2024.1 pypi_0 pypi
pyyaml 6.0.1 pypi_0 pypi
readline 8.2 h5eee18b_0
rich 13.7.1 pypi_0 pypi
rouge-score 0.1.2 pypi_0 pypi
scikit-learn 1.5.0 pypi_0 pypi
scipy 1.14.0 pypi_0 pypi
setuptools 69.5.1 py310h06a4308_0
six 1.16.0 pypi_0 pypi
sqlite 3.45.3 h5eee18b_0
sympy 1.12.1 pypi_0 pypi
tensorboard 2.16.2 pypi_0 pypi
tensorboard-data-server 0.7.2 pypi_0 pypi
tensorflow 2.16.1 pypi_0 pypi
tensorflow-io-gcs-filesystem 0.37.0 pypi_0 pypi
termcolor 2.4.0 pypi_0 pypi
tf-keras 2.16.0 pypi_0 pypi
threadpoolctl 3.5.0 pypi_0 pypi
tk 8.6.14 h39e8969_0
torch 2.3.1 pypi_0 pypi
triton 2.3.1 pypi_0 pypi
tzdata 2024a h04d1e81_0
urllib3 2.2.1 pypi_0 pypi
werkzeug 3.0.3 pypi_0 pypi
wheel 0.43.0 py310h06a4308_0
wrapt 1.16.0 pypi_0 pypi
xz 5.4.6 h5eee18b_1
zlib 1.2.13 h5eee18b_1
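
For reference, here is a minimal sanity check (my own addition, not something from the course) to confirm that PyTorch actually sees the GPU in this env before starting training:

import torch

print(torch.__version__)
print(torch.cuda.is_available())          # should print True on a working GPU setup
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # name of the visible GPU
    x = torch.randn(8, 8, device="cuda")
    print((x @ x).sum().item())           # a tiny CUDA op; if this hangs, the problem is below the Trainer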

# Text summarization on Polish news dataset

from datasets import load_dataset
from transformers import AutoTokenizer    
import torch
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np


import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# choose GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# load dataset
# the dataset ships with only a "train" split, so we create our own train/test split below
# train: Dataset({features: ['link', 'title', 'headline', 'content'], num_rows: 248123})
raw_dataset = load_dataset("WiktorS/polish-news", split="train").shuffle(seed=42).select(range(100))  # load the "train" split and keep 100 shuffled examples for a quick test run
raw_dataset = raw_dataset.train_test_split(test_size=0.1)  # creates "train" and "test" splits from the single original split

# dataset preprocessing: drop examples where headline or content is None

def filter_none(example):
    return example['headline'] is not None and example['content'] is not None

raw_dataset = raw_dataset.filter(filter_none)

# choose model checkpoint

checkpoint = "facebook/bart-large-cnn"

# load tokenizer

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenization function (mapping)

# Prefix the input with a prompt so the model knows this is a summarization task.
# (T5-style checkpoints need such a prefix; BART does not strictly require it, but it is harmless.)
prefix = "summarize: "

def tokenize_function(example):
    inputs = [prefix + doc for doc in example["content"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)  # dict with keys: input_ids, attention_mask
    # tokenize the targets (headlines) as labels, not only the input content
    labels = tokenizer(text_target=example["headline"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]  # add labels to the dict
    return model_inputs  # dict with keys: input_ids, attention_mask, labels


tokenize_dataset = raw_dataset.map(tokenize_function, batched=True) #features: ['link', 'title', 'headline', 'content', 'input_ids', 'attention_mask', 'labels']
print(tokenize_dataset["train"][0])

# create batches of examples; dynamically pad each batch to the length of its longest sequence during collation, instead of padding the whole dataset to the maximum length
# the data collator batches and pads the already-tokenized examples

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
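
# (Optional sanity check, not part of the course code: inspect one dynamically padded batch.
#  Only the tokenized columns are passed, because the collator cannot pad the raw string columns.)
sample_features = [
    {k: tokenize_dataset["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
sample_batch = data_collator(sample_features)
print({k: v.shape for k, v in sample_batch.items()})  # every tensor is padded to the longest sequence in the batch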

# arguments for trainer

training_args = Seq2SeqTrainingArguments(
    output_dir="final-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
)

# evaluation function passed to the trainer. Including a metric during training is often helpful for evaluating the model's performance.

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True) #Converts tokens to human-readable text, skipping special tokens.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id) # Replaces -100 values in labels with the pad token id so they can be decoded
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True) #Computes the ROUGE metric for the model's predictions

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions] #Counts the number of non-pad tokens in each prediction
    result["gen_len"] = np.mean(prediction_lens) #Calculates the average number of non-pad tokens in the predictions

    return {k: round(v, 4) for k, v in result.items()} #Returns the ROUGE metric results rounded to four decimal places

# initialize the model

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


# trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_dataset["train"],
    eval_dataset=tokenize_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    #compute_metrics=compute_metrics,
)

# train model

trainer.train()
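
For completeness, this is the kind of reduced run that could help narrow the problem down (just a sketch; the smaller values, the output_dir name, and fp16=False are assumptions on my side, not from the course):

debug_args = Seq2SeqTrainingArguments(
    output_dir="debug-run",
    per_device_train_batch_size=1,  # smallest possible batch
    max_steps=5,                    # stop after a handful of optimizer steps
    logging_steps=1,                # log every step so any progress shows up immediately
    fp16=False,                     # rule out mixed-precision issues
    report_to="none",               # skip TensorBoard logging for this test
)

debug_trainer = Seq2SeqTrainer(
    model=model,
    args=debug_args,
    train_dataset=tokenize_dataset["train"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

debug_trainer.train()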