Fine tune T5 hit error in predictions = np.argmax(predictions, axis=1)

below is my code for fine tune T5, it hits error in predictions = np.argmax(predictions, axis=1)
predictions

  • The first element of the predictions tuple has a shape of (4, 512, 32128).
  • The second element of the predictions tuple has a shape of (4, 512, 768)
    but they are not equal.
    and error is could not broadcast input array from shape (4,512,32128) into shape (4,512)
    so my guess is the predictions has to tensor, one (batch_size, sequence_length, num_classes) The second element appears to have dimensions corresponding to (batch_size, sequence_length, embedding_size) or (4, 512, 768) .

hugginface doc Text classification use predictions = np.argmax(predictions, axis=1)
I change to predictions = np.argmax(predictions, axis=-1) also not work.
same ValueError: could not broadcast input array from shape (4,512,32128) into shape (4,512)

import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, AutoAdapterModel
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import get_linear_schedule_with_warmup
import torch

def load_adapter(model_path, adapter_path, adapter_name):
# Load the original model architecture
model = AutoAdapterModel.from_pretrained(model_path) if len(adapter_path)>0 else T5ForConditionalGeneration.from_pretrained(model_path)
if (len(adapter_path)>0):
model.load_adapter(adapter_path)
else:
model.add_adapter(adapter_name)
model.train_adapter(adapter_name)
# Set the active adapter for inference
model.set_active_adapters(adapter_name)
# model.freeze_model()
# Load the tokenizer
return model

def compute_metrics(eval_pred):
predictions, labels = eval_pred
print(labels)
print(type(predictions))
print(len(predictions))
print(predictions[0].shape)
print(predictions[1].shape)
if np.array_equal(predictions[0][:, :, :2], predictions[1][:, :, :2]):
print(“The first two dimensions of predictions[0] and predictions[1] are the same.”)
print (predictions[0][:, :, :2])
else:
print(“The first two dimensions of predictions[0] and predictions[1] are different.”)
print (predictions[0][:, :, :2])
print (predictions[1][:, :, :2])

compare if first two dim are the same

predictions = np.argmax(predictions, axis=1)

# Calculating metrics
accuracy = accuracy_score(labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

return {
    'accuracy': accuracy,
    'f1': f1,
    'precision': precision,
    'recall': recall,
}

MODEL = ‘t5-base’
ADAPTER_PATH = ‘./fine_tuned_adapter_m5’
ADAPTER = ‘emotion’
model = load_adapter(MODEL, ADAPTER_PATH, ADAPTER)
device = torch.device(“cuda” if torch.cuda.is_available() else “cpu”)
model.to(device)

Load the dataset

dataset = load_dataset(‘emotion’)

Create a function to encode the data

def label2text(label):
# ‘sadness’ ‘joy’ ‘love’ ‘anger’ ‘fear’ 'surprise
emotion_dict = {0: ‘sadness’, 1: ‘joy’, 2: ‘love’, 3: ‘anger’, 4:‘fear’, 5:‘surprise’} # Fill with your actual mapping
return emotion_dict[label]

def encode(batch, tokenizer=T5Tokenizer.from_pretrained(MODEL)):
# Create target_text which is just the emotion text for each example in the batch
target_text = [label2text(label) for label in batch[‘label’]] # Convert each emotion label to text
inputs = tokenizer(batch[‘text’], truncation=True, padding=‘max_length’, return_tensors=‘pt’)
with tokenizer.as_target_tokenizer():
labels = tokenizer(target_text, truncation=True, padding=‘max_length’, return_tensors=‘pt’)
inputs[‘labels’] = labels[‘input_ids’]
return inputs

Encode the dataset

dataset = dataset.map(encode, batched=True)

num_train_epochs = 1
learning_rate = 1e-5
train_dataset = dataset[‘train’].select(range(4))
warmup_steps = int(len(train_dataset) * num_train_epochs * 0.1) # 10% of train data for warm-up
total_steps = len(train_dataset) * num_train_epochs

eval_dataset=dataset[‘validation’].shuffle(seed=42).select(range(4))

Define the training arguments

training_args = TrainingArguments(
output_dir=‘./results’,
num_train_epochs=num_train_epochs,
evaluation_strategy=“epoch”,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
learning_rate=learning_rate,
warmup_steps=warmup_steps,
weight_decay=0.01,
logging_dir=‘./logs’,
# use_mps_device=False,
)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

Create a learning rate scheduler

lr_scheduler = get_linear_schedule_with_warmup(
optimizer=optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=total_steps
)

Define the Trainer and train

trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
optimizers=(optimizer, lr_scheduler),
)

trainer.train()
model.save_adapter(ADAPTER_PATH, ADAPTER)