ValueError: could not broadcast input array from shape (30,512,32128) into shape (30,512)

I am getting this error when I run the training on flan-t5-base llm. Using Hugging face transformers. I tried multiple options to fix this compute_matrics but I am hitting a brick wall. I am a high school user of transformer for a science project. I am using google colab pro version of my parent.

Error I am getting is

ValueError Traceback (most recent call last)
in <cell line: 2>()
1 # Start training
----> 2 trainer.train()

9 frames

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1537 hf_hub_utils.enable_progress_bars()
1538 else:
→ 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1942
1943 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
→ 1944 self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
1945
1946 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval)
2289 metrics = None
2290 if self.control.should_evaluate:
→ 2291 metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
2292 self._report_to_hp_search(trial, self.state.global_step, metrics)
2293

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
3093
3094 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
→ 3095 output = eval_loop(
3096 eval_dataloader,
3097 description=“Evaluation”,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
3384 )
3385 else:
→ 3386 metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
3387 else:
3388 metrics = {}

in compute_metrics(eval_pred)
24 logits, labels = eval_pred
25 # Convert logits to predicted token IDs
—> 26 predictions = np.argmax(logits, axis=-1)
27 # Flatten the predictions and labels for comparison
28 true_labels = labels.flatten()

/usr/local/lib/python3.10/dist-packages/numpy/core/overrides.py in argmax(*args, **kwargs)

/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in argmax(a, axis, out, keepdims)
1214 “”"
1215 kwds = {‘keepdims’: keepdims} if keepdims is not np._NoValue else {}
→ 1216 return _wrapfunc(a, ‘argmax’, axis=axis, out=out, **kwds)
1217
1218

/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in _wrapfunc(obj, method, *args, **kwds)
52 bound = getattr(obj, method, None)
53 if bound is None:
—> 54 return _wrapit(obj, method, *args, **kwds)
55
56 try:

/usr/local/lib/python3.10/dist-packages/numpy/core/fromnumeric.py in _wrapit(obj, method, *args, **kwds)
41 except AttributeError:
42 wrap = None
—> 43 result = getattr(asarray(obj), method)(*args, **kwds)
44 if wrap:
45 if not isinstance(result, mu.ndarray):

ValueError: could not broadcast input array from shape (30,512,32128) into shape (30,512)

My code is this.

Import necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, load_metric
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, AutoTokenizer

from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

“”"# Load and prepare datasets

Load Datasets

load the CSV files and you will need to combine the seperate files into a single file and then split it into Train, Validate and Test
“”"

Load datasets

def load_dataset(file_path):
df = pd.read_csv(file_path, sep = ‘|’)
df.columns = [‘text’, ‘labels’] # Assuming two columns: ‘input’ and ‘output’
return df

“”“Split the data set into Train, validate and test”“”

Split dataset

def split_dataset(df):
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)
return map(Dataset.from_pandas, (train_df, val_df, test_df))

“”“Now we load the data and complete the split”“”

Load and split each dataset

sThread_train, sThread_val, sThread_test = split_dataset(load_dataset(‘tags.csv’))
dThread_train, dThread_val, dThread_test = split_dataset(load_dataset(‘tagd.csv’))
tThread_train, tThread_val, tThread_test = split_dataset(load_dataset(‘tagn.csv’))

display(sThread_train)

“”“Combine the split data sets”“”

combined_dataset = DatasetDict({
‘train’: Dataset.from_dict(pd.concat([sThread_train.to_pandas(), dThread_train.to_pandas(), tThread_train.to_pandas()])),
‘validation’: Dataset.from_dict(pd.concat([sThread_val.to_pandas(), dThread_val.to_pandas(), tThread_val.to_pandas()])),
‘test’: Dataset.from_dict(pd.concat([sThread_test.to_pandas(), dThread_test.to_pandas(), tThread_test.to_pandas()]))
})

display(combined_dataset)

“”"
def filter_non_strings(dataset):
return dataset.filter(lambda example: isinstance(example[‘text’], str) and isinstance(example[‘labels’], str))
“”"

Apply the filtering function to each split in the dataset

#filtered_dataset = DatasetDict({split: filter_non_strings(combined_dataset[split]) for split in combined_dataset.keys()})

#display(filtered_dataset)

“”"# Setup Tokenizer and Initiatlization

Initialize the tokenizer and model
“”"

Initialize the tokenizer and model

tokenizer = AutoTokenizer.from_pretrained(‘google/flan-t5-base’)
model = T5ForConditionalGeneration.from_pretrained(‘google/flan-t5-base’)

“”“Tokenization function”“”

Tokenization function

def tokenize_function(examples):
model_inputs = tokenizer(examples[‘text’], padding=‘max_length’, truncation=True, max_length=512)
with tokenizer.as_target_tokenizer():
labels = tokenizer(examples[‘labels’], padding=‘max_length’, truncation=True, max_length=512)
model_inputs[‘labels’] = labels[‘input_ids’]
return model_inputs
“”"

def tokenize_function(examples):
model_inputs = tokenizer(examples[‘text’], padding=‘max_length’, truncation=True, max_length=512)
labels = tokenizer(examples[‘labels’], padding=‘max_length’, truncation=True, max_length=512)
return model_inputs, labels

def tokenize_function(examples):
assert isinstance(examples[‘text’], list), “Text data is not in list format”
assert all(isinstance(text, str) for text in examples[‘text’]), “Not all text entries are strings”
# Tokenize the text and labels together in the model_inputs dictionary
model_inputs = tokenizer(examples[‘text’], padding=‘max_length’, truncation=True, max_length=512)
# Tokenize labels and add them to model_inputs under the key ‘labels’
labels = tokenizer(examples[‘labels’], padding=‘max_length’, truncation=True, max_length=512)
model_inputs[‘labels’] = labels[‘input_ids’]
return model_inputs
“”"

“”“Tokenize data sets”“”

Tokenize datasets - combined does not work

tokenized_datasets = combined_dataset.map(tokenize_function, batched=True)

Then, apply the tokenize_function to the filtered dataset

#tokenized_datasets = filtered_dataset.map(tokenize_function, batched=True)

“”“Training arguments”“”

Training arguments

training_args = TrainingArguments(
output_dir=“./results”,
evaluation_strategy=“epoch”,
learning_rate=2e-5,
per_device_train_batch_size=4,
per_device_eval_batch_size=4,
num_train_epochs=3,
weight_decay=0.01,
save_strategy=“epoch”,
load_best_model_at_end=True,
)

“”“Define a function to compute accuracy”“”

prompt: how to find the shape of the shape of the input array of the training set

display(tokenized_datasets)

Define a function to compute accuracy

def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=2) # Fix the axis
accuracy = accuracy_score(labels, predictions)
recall = recall_score(labels, predictions, average=‘weighted’) # Use ‘binary’ for binary classification
return {“accuracy”: accuracy, “recall”: recall}

“”“Initialize Trainer”“”

Initialize Trainer

trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets[‘train’],
eval_dataset=tokenized_datasets[‘validation’],
compute_metrics=compute_metrics
)

“”“Start Training”“”

Start training

trainer.train()

This got solved by changing the compute function to this one. I found that the logits was a tuple. And the first element was the logits, Once that was identified then I could handle it. This fixed my problem and the code completed successfully…
def compute_metrics(eval_pred):
logits, labels = eval_pred
# Assuming logits are the first element of the tuple
logits_shape = logits[0].shape if isinstance(logits, tuple) else logits.shape
#print(f"logits shape: {logits_shape}“) # This will print the shape of the logits
#print(f"labels shape: {labels.shape}”) # Continue to print labels shape

# Proceed with your metrics calculation
predictions = np.argmax(logits[0], axis=-1) if isinstance(logits, tuple) else np.argmax(logits, axis=-1)
labels = labels.flatten()
predictions = predictions.flatten()
accuracy = accuracy_score(labels, predictions)
recall = recall_score(labels, predictions, average='weighted', zero_division=0)

return {"accuracy": accuracy, "recall": recall}

This topic was automatically closed 12 hours after the last reply. New replies are no longer allowed.