I am getting this error when I run training on the flan-t5-base LLM with Hugging Face Transformers. I have tried multiple options to fix compute_metrics but I am hitting a brick wall. I am a high school student using Transformers for a science project, running on my parent's Google Colab Pro account.
The error I am getting is:
ValueError Traceback (most recent call last)
in <cell line: 2>()
1 # Start training
----> 2 trainer.train()
9 frames
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1537 hf_hub_utils.enable_progress_bars()
1538 else:
---> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1942
1943 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
---> 1944 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1945
1946 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2289 metrics = None
2290 if self.control.should_evaluate:
---> 2291 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2292 self._report_to_hp_search(trial, self.state.global_step, metrics)
2293
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
3093
3094 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
---> 3095 output = eval_loop(
3096 eval_dataloader,
3097 description="Evaluation",
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3384 )
3385 else:
---> 3386 metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
3387 else:
3388 metrics = {}
in compute_metrics(eval_pred)
24 logits, labels = eval_pred
25 # Convert logits to predicted token IDs
---> 26 predictions = np.argmax(logits, axis=-1)
27 # Flatten the predictions and labels for comparison
28 true_labels = labels.flatten()
/usr/local/lib/python3.10/dist-packages/numpy/core/overrides.py in argmax(*args, **kwargs)
/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in argmax(a, axis, out, keepdims)
1214 """
1215 kwds = {'keepdims': keepdims} if keepdims is not np._NoValue else {}
---> 1216 return _wrapfunc(a, 'argmax', axis=axis, out=out, **kwds)
1217
1218
/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
52 bound = getattr(obj, method, None)
53 if bound is None:
---> 54 return _wrapit(obj, method, *args, **kwds)
55
56 try:
/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in _wrapit(obj, method, *args, **kwds)
41 except AttributeError:
42 wrap = None
---> 43 result = getattr(asarray(obj), method)(*args, **kwds)
44 if wrap:
45 if not isinstance(result, mu.ndarray):
ValueError: could not broadcast input array from shape (30,512,32128) into shape (30,512)
My code is below. If I am reading the error right, 512 is my max_length and 32128 is the flan-t5 vocabulary size, so the predictions look like per-token logits of shape (batch, seq_len, vocab_size) while the labels are token IDs of shape (batch, seq_len).
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, AutoTokenizer
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
“”"# Load and prepare datasets
Load Datasets
load the CSV files and you will need to combine the seperate files into a single file and then split it into Train, Validate and Test
“”"
# Load datasets
def load_dataset(file_path):
    df = pd.read_csv(file_path, sep='|')
    df.columns = ['text', 'labels']  # Assuming two columns: 'input' and 'output'
    return df
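For reference, this is the kind of quick check I can run on one file to confirm it loads with the two columns I expect (a sketch; 'tags.csv' is one of my three files):
# Quick sanity check of one loaded CSV (sketch)
df = load_dataset('tags.csv')
print(df.shape)   # (number_of_rows, 2)
print(df.head())  # should show the 'text' and 'labels' columns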
"""Split the dataset into train, validation and test"""
# Split dataset
def split_dataset(df):
    train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
    return map(Dataset.from_pandas, (train_df, val_df, test_df))
"""Now we load the data and complete the split"""
# Load and split each dataset
sThread_train, sThread_val, sThread_test = split_dataset(load_dataset('tags.csv'))
dThread_train, dThread_val, dThread_test = split_dataset(load_dataset('tagd.csv'))
tThread_train, tThread_val, tThread_test = split_dataset(load_dataset('tagn.csv'))
display(sThread_train)
"""Combine the split datasets"""
combined_dataset = DatasetDict({
    'train': Dataset.from_dict(pd.concat([sThread_train.to_pandas(), dThread_train.to_pandas(), tThread_train.to_pandas()])),
    'validation': Dataset.from_dict(pd.concat([sThread_val.to_pandas(), dThread_val.to_pandas(), tThread_val.to_pandas()])),
    'test': Dataset.from_dict(pd.concat([sThread_test.to_pandas(), dThread_test.to_pandas(), tThread_test.to_pandas()]))
})
display(combined_dataset)
“”"
def filter_non_strings(dataset):
return dataset.filter(lambda example: isinstance(example[‘text’], str) and isinstance(example[‘labels’], str))
“”"
Apply the filtering function to each split in the dataset
#filtered_dataset = DatasetDict({split: filter_non_strings(combined_dataset[split]) for split in combined_dataset.keys()})
#display(filtered_dataset)
“”"# Setup Tokenizer and Initiatlization
Initialize the tokenizer and model
“”"
# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
"""Tokenization function"""
# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=512)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['labels'], padding='max_length', truncation=True, max_length=512)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs
“”"
def tokenize_function(examples):
model_inputs = tokenizer(examples[‘text’], padding=‘max_length’, truncation=True, max_length=512)
labels = tokenizer(examples[‘labels’], padding=‘max_length’, truncation=True, max_length=512)
return model_inputs, labels
def tokenize_function(examples):
assert isinstance(examples[‘text’], list), “Text data is not in list format”
assert all(isinstance(text, str) for text in examples[‘text’]), “Not all text entries are strings”
# Tokenize the text and labels together in the model_inputs dictionary
model_inputs = tokenizer(examples[‘text’], padding=‘max_length’, truncation=True, max_length=512)
# Tokenize labels and add them to model_inputs under the key ‘labels’
labels = tokenizer(examples[‘labels’], padding=‘max_length’, truncation=True, max_length=512)
model_inputs[‘labels’] = labels[‘input_ids’]
return model_inputs
“”"
"""Tokenize datasets"""
# Tokenize datasets - combined does not work
tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)
# Then, apply the tokenize_function to the filtered dataset
#tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)
"""Training arguments"""
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
)
"""Define a function to compute accuracy"""
# prompt: how to find the shape of the input array of the training set
display(tokenized_datasets)
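To get actual shapes rather than just the dataset summary from display, I believe something like this works (a sketch):
# Shape check on the tokenized training split (sketch)
train_ids = np.array(tokenized_datasets['train']['input_ids'])
train_labels = np.array(tokenized_datasets['train']['labels'])
print(train_ids.shape)     # expecting (num_train_rows, 512)
print(train_labels.shape)  # expecting (num_train_rows, 512)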
# Define a function to compute accuracy
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=2)  # Fix the axis
    accuracy = accuracy_score(labels, predictions)
    recall = recall_score(labels, predictions, average='weighted')  # Use 'binary' for binary classification
    return {"accuracy": accuracy, "recall": recall}
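From the traceback, the predictions seem to arrive as raw per-token logits while the labels are token IDs, so I assume the metric has to take argmax over the vocabulary axis and then compare flattened token IDs. This is the direction I have been trying, still without success (a sketch under that assumption; compute_metrics_tokens is just a separate name so it does not clobber the version above, and I have not worked out how padding tokens should be handled):
def compute_metrics_tokens(eval_pred):
    logits, labels = eval_pred
    # Logits should be (batch, seq_len, vocab_size); pick the most likely token ID.
    # This np.argmax call is where the traceback above fails.
    predictions = np.argmax(logits, axis=-1)
    # Compare flattened token IDs
    true_labels = labels.flatten()
    pred_labels = predictions.flatten()
    accuracy = accuracy_score(true_labels, pred_labels)
    recall = recall_score(true_labels, pred_labels, average='weighted')
    return {"accuracy": accuracy, "recall": recall}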
"""Initialize Trainer"""
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics
)
"""Start Training"""
# Start training
trainer.train()