Hello! I’d love some help resolving this error.
I’m fine-tuning the CLAP model for an audio classification task using transformers, with a custom classification head built in torch. I’m stuck troubleshooting an error related to the is_longer variable that the CLAP processor produces when encoding the data. None of my examples exceed the maximum length set during preprocessing, so every is_longer value should be False and fusion should therefore not be needed.
However, self.enable_fusion appears to be True inside the model, which raises an error when it tries to use the is_longer value. I don’t need fusion. I’ve tried explicitly setting clap_model.config.enable_fusion = False and manually setting is_longer to False, but neither changes the error, and I’d appreciate some insight into where my code falls short.
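For context, this is the kind of quick check I ran in a separate cell after loading the model and processor (I’m assuming config.enable_fusion and the processor’s is_longer output are the right things to look at):

from transformers import ClapModel

# Load the checkpoint and inspect the fusion flag after overriding it
clap_model = ClapModel.from_pretrained(model_id)
clap_model.config.enable_fusion = False
print("config.enable_fusion:", clap_model.config.enable_fusion)  # prints False after the override

# Encode one training example and look at the is_longer flag the processor returns
sample = processor(
    audios=dataset[0]["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="np",
)
print("is_longer from processor:", sample["is_longer"])  # expected to be all False for my data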
# Imports used below (processor, dataset, sampling_rate, model_id, id2label and num_labels are defined in earlier notebook cells)
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import evaluate
from collections import Counter
from datasets import DatasetDict
from sklearn.metrics import precision_recall_fscore_support
from transformers import ClapModel, Trainer, TrainingArguments

# Define data preprocessing function
max_duration = 6.0  # seconds, based on the longest example in the training data
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio"]]
    inputs = processor(
        audios=audio_arrays,
        sampling_rate=sampling_rate,
        return_tensors="np",
        padding=True,
        truncation=True,
        max_length=int(sampling_rate * max_duration),
    )
    return inputs
# Preprocess the entire dataset once
dataset_encoded = dataset.map(
    preprocess_function,
    remove_columns=["audio"],
    batched=True,
    batch_size=200,
    num_proc=1,
)
# Add post-processing to set `is_longer` to False
def set_is_longer_false(example):
    example["is_longer"] = torch.tensor([False])
    return example

# Apply the post-processing function to set `is_longer` to False for each example
dataset_encoded = dataset_encoded.map(
    set_is_longer_false,
    batched=False,  # Process each example independently
)
# Check whether CUDA is available and pick the device accordingly
device = "cuda" if torch.cuda.is_available() else "cpu"
# Set format to PyTorch tensors and move the listed columns to that device
dataset_encoded.set_format(
    type="torch",
    columns=["input_features", "label", "is_longer"],
    device=device,
)
# Print the format and dimensions of input_features for verification
sample = dataset_encoded[0] # Access a sample for verification
print("Sample input_features format:", type(sample["input_features"])) # Should show <class 'torch.Tensor'>
print("Sample input_features shape:", sample["input_features"].shape) # Should confirm dimensions (e.g., [1, 512, 512])
# Get the unique podcast IDs
unique_podcast_ids = sorted(set(dataset_encoded["podcast_id"]))
print("The podcast IDs are:", unique_podcast_ids)
# Function to create train/test split based on podcast ID
def leave_one_out_split(dataset, test_podcast_id):
    print("Using single processor for filtering")
    # Perform the filtering while keeping tensors on the CPU
    train_dataset = dataset.filter(lambda example: example["podcast_id"] != test_podcast_id)
    test_dataset = dataset.filter(lambda example: example["podcast_id"] == test_podcast_id)
    return DatasetDict({"train": train_dataset, "test": test_dataset})
# Define a custom classification head on top of the CLAP model
class CLAPAudioClassifier(nn.Module):
    def __init__(self, clap_model, num_labels):
        super().__init__()
        self.clap_model = clap_model
        self.classifier = nn.Linear(clap_model.config.projection_dim, num_labels)

    def forward(self, input_features, labels):
        # Explicitly set is_longer to False and enable_fusion to False
        is_longer = torch.tensor([False], device=input_features.device)
        self.clap_model.enable_fusion = False
        print("Fusion enabled:", self.clap_model.config.enable_fusion)
        # Get audio embeddings from CLAP with is_longer set
        audio_embeddings = self.clap_model.get_audio_features(
            input_features=input_features,
            is_longer=is_longer,
        )
        # Get audio embeddings from CLAP again (this second call overwrites the result above and passes no is_longer)
        audio_embeddings = self.clap_model.get_audio_features(input_features=dataset_encoded["input_features"])
        # Apply classifier head to predict logits
        logits = self.classifier(audio_embeddings)
        # Calculate loss
        loss = F.cross_entropy(logits, labels)
        return {"loss": loss, "logits": logits}
# Define training arguments
def create_training_args(podcast_id):
    model_name = model_id.split("/")[-1]
    output_dir = f"/content/drive/My Drive/Thesis/crossval/{model_name}_gender_loo_cv_podcast_{podcast_id}"
    batch_size = 64
    gradient_accumulation_steps = 1
    num_train_epochs = 10
    args = TrainingArguments(
        output_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        warmup_ratio=0.1,
        logging_steps=5,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        push_to_hub=False,
        report_to="tensorboard",  # Log training metrics to TensorBoard
    )
    # Use `set_dataloader` to set `pin_memory=False`
    args = args.set_dataloader(
        train_batch_size=batch_size,
        eval_batch_size=batch_size,
        pin_memory=False,  # Disable pinning memory for the DataLoader
    )
    return args
# Define evaluation metrics
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        eval_pred.label_ids, predictions, average="macro", zero_division=1
    )
    accuracy = metric.compute(predictions=predictions, references=eval_pred.label_ids)["accuracy"]
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
# Function to run a fold
def run_fold(podcast_id):
    print(f"Training, leaving out podcast {podcast_id}")
    # Split dataset
    dataset_split = leave_one_out_split(dataset_encoded, podcast_id)
    # Count labels in the training data and convert tensor labels to integers
    label_counts = Counter(int(label.item()) for label in dataset_split["train"]["label"])
    # Map the label IDs to their string names
    label_counts_mapped = {id2label[label]: count for label, count in label_counts.items()}
    # Print label counts
    print("Label counts in training data:")
    for label, count in label_counts_mapped.items():
        print(f"{label}: {count}")
    # Calculate the total count of all interruptions
    interruptions = sum(count for label, count in label_counts_mapped.items() if label != "noninterruption")
    print(f"Interruptions: {interruptions}")
    # Reinitialize the model each fold to prevent leakage
    # Move the model to the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    clap_model = ClapModel.from_pretrained(model_id).to(device)
    clap_model.config.enable_fusion = False
    model = CLAPAudioClassifier(clap_model, num_labels=num_labels).to(device)
    print("Enable fusion status:", clap_model.config.enable_fusion)  # Should print False
    # Initialize training arguments
    training_args = create_training_args(podcast_id)
    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_split["train"],
        eval_dataset=dataset_split["test"],
        tokenizer=processor,
        compute_metrics=compute_metrics,
    )
    trainer.train()
run_fold(unique_podcast_ids[0])
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-24-86cdd38ddcc9> in <cell line: 1>()
----> 1 run_fold(unique_podcast_ids[0])
17 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/clap/modeling_clap.py in forward(self, input_features, is_longer, head_mask, output_attentions, output_hidden_states, output_hidden_states_before_downsampling, always_partition, return_dict)
909 is_longer_list_idx = None
910 if self.enable_fusion:
--> 911 is_longer_list = is_longer.to(input_features.device)
912 is_longer_list_idx = torch.where(is_longer_list == 1)[0]
913
AttributeError: 'NoneType' object has no attribute 'to'
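In case it helps, I also tried to confirm (in a scratch cell, loading the checkpoint the same way run_fold does) which flag line 911 is actually reading. The audio_model.audio_encoder path is my best guess from skimming modeling_clap.py, so it may not be exact:

from transformers import ClapModel

clap_model = ClapModel.from_pretrained(model_id)
clap_model.config.enable_fusion = False
# Compare the top-level config flag I override with the flag stored on the audio encoder itself,
# which (if I read the traceback right) is the self.enable_fusion that line 911 checks
print("config.enable_fusion:", clap_model.config.enable_fusion)
print("audio_encoder.enable_fusion:", clap_model.audio_model.audio_encoder.enable_fusion)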