Trying to run facebook/musicgen-small on CPU with 16 GB RAM

I have an AMD Ryzen 5 4600H with Radeon Graphics (GPU: Nvidia GTX 1650 Ti) and 16 GB of DDR4-3200 RAM.
This is the code I’m trying to run:


import os
import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, MusicgenMelodyConfig, pipeline
import gc
from torch.nn import functional as F

# Function to log memory usage
def log_memory(stage=""):
    process = psutil.Process(os.getpid())
    print(f"Memory Usage after {stage}: {process.memory_info().rss / 1024 ** 2} MB")

log_memory("initial load")

# Hugging Face token for authentication
token = "TOKEN"

# Load model configuration and manually add missing config attributes
model_name = "facebook/musicgen-small"  # Use smaller variants if available
config = MusicgenMelodyConfig.from_pretrained(model_name, token=token)

# Manually add the missing 'use_cache' attribute
config.use_cache = False  # This should resolve the AttributeError you encountered

# Manually add the missing initializer_factor if it's required
config.initializer_factor = 1.0  # Default value for initialization

# Modify configuration parameters for debugging
config.dropout = 0.1
config.layerdrop = 0.1
config.max_position_embeddings = 512  # Reduced
config.hidden_size = 128  # Smaller hidden size
config.num_codebooks = 128  # Adjusted to a smaller number for compatibility
config.scale_embedding = True
config.vocab_size = 50257
config.num_hidden_layers = 2  # Fewer layers
config.num_attention_heads = 4  # Fewer attention heads
config.attention_dropout = 0.1
config.activation_function = "gelu"
config.activation_dropout = 0.1
config.ffn_dim = 1024

log_memory("after config")

# Load model without weight tying
from transformers.models.musicgen_melody.modeling_musicgen_melody import MusicgenMelodyModel

# Override the model class to disable weight tying
class CustomMusicgenModel(MusicgenMelodyModel):
    def tie_weights(self):
        pass  # Disable weight tying

# Load the model in FP16 (half precision) for reduced memory usage
model = CustomMusicgenModel.from_pretrained(model_name, config=config, token=token, torch_dtype=torch.float16)

log_memory("after model loaded")

# Tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Use a pipeline for text-to-audio generation
generator = pipeline("text-to-audio", model=model, tokenizer=tokenizer, device=-1)  # Use CPU to save memory

# Set model to evaluation mode to save memory
model.eval()

# Ensure proper input shape by padding to the required size
prompt = "A relaxing jazz track with piano and bass."

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Get the current sequence length
sequence_length = input_ids.shape[-1]

# The model expects the sequence size to be divisible by 1280 (128 * 10)
required_length = 1280 * (sequence_length // 1280 + 1)  # Round up to the next multiple of 1280

# Pad the input tensor to the required length
if sequence_length < required_length:
    padding_size = required_length - sequence_length
    input_ids = F.pad(input_ids, (0, padding_size))  # Pad the input tensor

# Ensure the padded sequence size is divisible by 1280
assert input_ids.numel() % 1280 == 0, f"Input size must be divisible by 1280, but got {input_ids.numel()}"

# Now reshape the input tensor to match the expected shape [-1, 128, 10]
input_ids = input_ids.reshape(-1, 128, 10)

# Check the shape after reshaping
print(f"Input tensor shape after reshaping: {input_ids.shape}")

# Generate audio based on input prompt with no_grad to save memory
with torch.no_grad():
    generated_audio = generator(prompt)
    print(generated_audio)

log_memory("after generation")

# Check type of the audio data
print(f"Type of generated audio: {type(generated_audio['audio'])}")

# Save the generated audio to a file
if isinstance(generated_audio['audio'], bytes):
    with open("generated_music.wav", "wb") as f:
        f.write(generated_audio['audio'])
else:
    print("Unexpected audio format, unable to save.")

# Cleanup
del generated_audio  # Explicitly delete the variable
gc.collect()  # Garbage collection
log_memory("after cleanup")

I always get an error that ends with:

  File "/home/ronalds/.local/lib/python3.10/site-packages/transformers/models/musicgen_melody/modeling_musicgen_melody.py", line 923, in forward
    input = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1])
RuntimeError: shape '[-1, 128, 10]' is invalid for input of size 10
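
For context on that error: the MusicGen decoder reshapes its input_ids to (-1, num_codebooks, seq_len), so overriding num_codebooks to 128 makes it demand a multiple of 128 * 10 = 1280 elements from a 10-token prompt. A minimal sketch reproducing just that reshape (the stock facebook/musicgen-small decoder uses num_codebooks = 4, and generate() builds the codebook dimension itself, so the text prompt never needs manual padding or reshaping):

import torch

# The failing line in modeling_musicgen_melody.py is effectively:
#     input = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1])
prompt_ids = torch.zeros(1, 10, dtype=torch.long)  # stands in for the 10-token tokenized prompt

try:
    prompt_ids.reshape(-1, 128, prompt_ids.shape[-1])  # num_codebooks overridden to 128
except RuntimeError as e:
    print(e)  # shape '[-1, 128, 10]' is invalid for input of size 10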

Does anyone have code that works with the facebook/musicgen-small model, or are there models that work better (and code for them)?
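
For reference, the Transformers documentation shows a much simpler text-to-audio pipeline usage for this checkpoint, with no config overrides at all; a minimal sketch based on that docs example (not verified on this machine):

import scipy.io.wavfile
from transformers import pipeline

# Let the pipeline load the unmodified checkpoint and handle pre/post-processing.
synthesiser = pipeline("text-to-audio", model="facebook/musicgen-small", device=-1)  # -1 = CPU
music = synthesiser("A relaxing jazz track with piano and bass.", forward_params={"do_sample": True})

# The pipeline returns a dict with the waveform and its sampling rate.
scipy.io.wavfile.write("musicgen_out.wav", rate=music["sampling_rate"], data=music["audio"])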

EDIT: I managed to generate an audio file, but it’s 0 seconds long, with this code:


import os
import psutil
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, MusicgenMelodyConfig, pipeline
import gc
import numpy as np
from scipy.io import wavfile  # For saving audio as WAV files
import torch.nn.functional as F  # Ensure that F is imported for functional operations

# Function to log memory usage
def log_memory(stage=""):
    process = psutil.Process(os.getpid())
    print(f"Memory Usage after {stage}: {process.memory_info().rss / 1024 ** 2} MB")

log_memory("initial load")

# Hugging Face token for authentication
token = "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"

# Load model configuration and manually add missing config attributes
model_name = "facebook/musicgen-small"  # Use smaller variants if available
config = MusicgenMelodyConfig.from_pretrained(model_name, token=token)

# Manually add the missing 'use_cache' attribute
config.use_cache = False  # This should resolve the AttributeError you encountered

# Manually add the missing initializer_factor if it's required
config.initializer_factor = 1.0  # Default value for initialization

# Modify configuration parameters for debugging
config.dropout = 0.1
config.layerdrop = 0.1
config.max_position_embeddings = 512  # Reduced
config.hidden_size = 128  # Smaller hidden size
config.num_codebooks = 128  # Adjusted to a smaller number for compatibility
config.scale_embedding = True
config.vocab_size = 50257
config.num_hidden_layers = 2  # Fewer layers
config.num_attention_heads = 4  # Fewer attention heads
config.attention_dropout = 0.1
config.activation_function = "gelu"
config.activation_dropout = 0.1
config.ffn_dim = 1024

log_memory("after config")

# Load model without weight tying
from transformers.models.musicgen_melody.modeling_musicgen_melody import MusicgenMelodyModel

# Override the model class to disable weight tying
class CustomMusicgenModel(MusicgenMelodyModel):
    def tie_weights(self):
        pass  # Disable weight tying
    
    def forward(self, input_ids, attention_mask=None, **kwargs):
        # Check if the input tensor is small and needs to be reshaped
        seq_len = input_ids.shape[-1]
        required_size = 1280
        
        # Ensure the input is divisible by 1280, the total size expected by the model
        if seq_len % required_size != 0:
            # If not, pad the input to the next multiple of required_size
            pad_size = required_size - (seq_len % required_size)
            input_ids = F.pad(input_ids, (0, pad_size))  # F is now defined

        # Proceed with the regular forward pass
        input_ids = input_ids.reshape(-1, 128, 10)  # reshape as needed
        return super().forward(input_ids, attention_mask=attention_mask, **kwargs)

# Load the model in FP16 (half precision) for reduced memory usage
model = CustomMusicgenModel.from_pretrained(model_name, config=config, token=token, torch_dtype=torch.float16)

log_memory("after model loaded")

# Tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Use a pipeline for text-to-audio generation
generator = pipeline("text-to-audio", model=model, tokenizer=tokenizer, device=-1)  # Use CPU to save memory

# Set model to evaluation mode to save memory
model.eval()

# Ensure proper input shape by padding to the required size
prompt = "A relaxing jazz track with piano and bass."

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Print the shape of the input tensor before reshaping
print(f"Input tensor shape before reshaping: {input_ids.shape}")

# Generate audio based on input prompt with no_grad to save memory
with torch.no_grad():
    generated_audio = generator(prompt)
    print(generated_audio)

log_memory("after generation")

# Check type of the audio data
print(f"Type of generated audio: {type(generated_audio['audio'])}")

# Inspect the generated audio content
audio_data = generated_audio['audio']
print(f"Generated audio shape before any modifications: {audio_data.shape}")
print(f"Generated audio content preview: {audio_data}")

# Check the first few values of the generated audio data for debugging
print(f"Generated audio (first 10 samples): {audio_data[:10]}")

# Check the range of values
print(f"Audio sample range: {np.min(audio_data)} to {np.max(audio_data)}")

# Check if audio is empty
if audio_data.size == 0:
    print("Error: Generated audio is empty.")
else:
    # Flatten the audio if it's not in 1D
    audio_data = audio_data.flatten()  # Flatten the array to 1D
    
    # Normalize the audio to be in the range of int16 (for WAV files)
    audio_data = np.int16(audio_data * 32767)  # Scaling to the range of int16

    # Save the audio to a WAV file
    wavfile.write("generated_music.wav", 32000, audio_data)  # 32000 is the sampling rate
    print("Audio saved as generated_music.wav")

# Cleanup
del generated_audio  # Explicitly delete the variable
gc.collect()  # Garbage collection
log_memory("after cleanup")

Any way to make it work? I last tried this code and it still generates a 0-second .wav file:


import os
import psutil
import torch
from transformers import AutoTokenizer, MusicgenMelodyConfig, pipeline
import gc
import numpy as np
from scipy.io import wavfile
import torch.nn.functional as F
import librosa  # Fallback for Griffin-Lim vocoder
import torch.hub

# Function to log memory usage
def log_memory(stage=""):
    process = psutil.Process(os.getpid())
    print(f"Memory Usage after {stage}: {process.memory_info().rss / 1024 ** 2} MB")

log_memory("initial load")

# Hugging Face token for authentication
token = "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"

# Load model configuration and manually add missing config attributes
model_name = "facebook/musicgen-small"  # Use smaller variants if available
config = MusicgenMelodyConfig.from_pretrained(model_name, token=token)

# Manually add the missing attributes
config.use_cache = False  # This should resolve the AttributeError you encountered

# Add 'initializer_factor' if it's missing in the configuration
if not hasattr(config, 'initializer_factor'):
    config.initializer_factor = 1.0  # Default value

# Modify configuration parameters for debugging
config.dropout = 0.1
config.layerdrop = 0.1
config.max_position_embeddings = 512  # Reduced
config.hidden_size = 128  # Smaller hidden size
config.num_codebooks = 128  # Adjusted to a smaller number for compatibility
config.scale_embedding = True
config.vocab_size = 50257
config.num_hidden_layers = 2  # Fewer layers
config.num_attention_heads = 4  # Fewer attention heads
config.attention_dropout = 0.1
config.activation_function = "gelu"
config.activation_dropout = 0.1
config.ffn_dim = 1024

log_memory("after config")

# Load model without weight tying
from transformers.models.musicgen_melody.modeling_musicgen_melody import MusicgenMelodyModel

# Override the model class to disable weight tying
class CustomMusicgenModel(MusicgenMelodyModel):
    def tie_weights(self):
        pass  # Disable weight tying
    
    def forward(self, input_ids, attention_mask=None, **kwargs):
        # Ensure the input is divisible by 1280, the total size expected by the model
        seq_len = input_ids.shape[-1]
        required_size = 1280
        
        if seq_len % required_size != 0:
            # If not, pad the input to the next multiple of required_size
            pad_size = required_size - (seq_len % required_size)
            input_ids = F.pad(input_ids, (0, pad_size))  # F is now defined

        # Proceed with the regular forward pass
        input_ids = input_ids.reshape(-1, 128, 10)  # reshape as needed
        return super().forward(input_ids, attention_mask=attention_mask, **kwargs)

# Load the model in FP32 (full precision) this time
model = CustomMusicgenModel.from_pretrained(model_name, config=config, token=token, torch_dtype=torch.float32)

log_memory("after model loaded")

# Tokenizer for the model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)

# Use a pipeline for text-to-audio generation
generator = pipeline("text-to-audio", model=model, tokenizer=tokenizer, device=-1)  # Use CPU to save memory

# Set model to evaluation mode to save memory
model.eval()

# Ensure proper input shape by padding to the required size
prompt = "A relaxing jazz track with piano and bass."

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Print the shape of the input tensor before reshaping
print(f"Input tensor shape before reshaping: {input_ids.shape}")

# Generate audio based on input prompt with no_grad to save memory
with torch.no_grad():
    generated_audio = generator(prompt)

# Log generated audio output
print(f"Generated audio: {generated_audio}")

log_memory("after generation")

# Check the type of the generated audio
audio_data = generated_audio['audio']
print(f"Type of generated audio: {type(audio_data)}")

# Inspect the generated audio content in more detail
print(f"Generated audio shape before any modifications: {audio_data.shape}")
print(f"Generated audio content preview: {audio_data}")

# Check the first few values of the generated audio data for debugging
print(f"Generated audio (first 10 samples): {audio_data[:10]}")

# Check the range of values
print(f"Audio sample range: {np.min(audio_data)} to {np.max(audio_data)}")

# Now, let's use Griffin-Lim for spectrogram-to-waveform conversion (fallback)
if audio_data.size > 0:
    try:
        # Reshape the audio data to 2D (spectrogram)
        mel_spectrogram = np.squeeze(audio_data)  # Remove singleton dimensions (1, 10, 128) -> (10, 128)

        # Griffin-Lim vocoder: Convert spectrogram to waveform
        mel_spectrogram = mel_spectrogram.T  # Transpose for proper input (128 x 10 -> 10 x 128)
        waveform = librosa.istft(mel_spectrogram)  # Inverse Short-Time Fourier Transform (Griffin-Lim)

        # Normalize waveform to the [-1, 1] range
        waveform = np.clip(waveform, -1, 1)

        # Convert to int16 for WAV format
        audio_waveform = np.int16(waveform * 32767)

        # Save the audio to a WAV file
        wavfile.write("generated_music.wav", 32000, audio_waveform)  # 32000 is the sampling rate
        print("Audio saved as generated_music.wav")
    except Exception as e:
        print(f"Error during waveform generation: {e}")
else:
    print("Error: Generated audio is empty. Unable to save as WAV.")

# Cleanup
del generated_audio  # Explicitly delete the variable
gc.collect()  # Garbage collection
log_memory("after cleanup")

Okay, so I tried a different approach and started using Audiocraft, which also seems to fail at running MusicGen, because 16 GB doesn’t seem to be enough.

code:

import torch
import audiocraft
from audiocraft.models import MusicGen
import soundfile as sf
import numpy as np
import gc

# Set device to GPU if available, else use CPU
device_cpu = torch.device('cpu')
device_gpu = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model with 'small' variant (will use default device handling)
model = MusicGen.get_pretrained('small')  # Automatically places the model on the appropriate device

# Define function to generate audio in chunks and save them to disk
def generate_audio_in_chunks(model, prompt, total_duration_sec=200, chunk_duration_sec=10, sample_rate=16000):
    # Number of chunks to generate (total_duration / chunk_duration)
    num_chunks = total_duration_sec // chunk_duration_sec
    
    # Create a file to save audio (ensure to overwrite each time)
    output_path = "output_200_seconds.wav"
    
    for i in range(num_chunks):
        print(f"Generating chunk {i+1}/{num_chunks}...")
        
        with torch.no_grad():  # Disable gradient computation
            try:
                # Generate a chunk of audio
                audio_tensor = model.generate(prompt)  # Generate audio based on the prompt
                
            except RuntimeError as e:
                if "out of memory" in str(e):  # If GPU memory is full, switch to CPU
                    print("GPU memory full, switching to CPU...")
                    model.to(device_cpu)  # Move model to CPU
                    audio_tensor = model.generate(prompt)  # Run on CPU
                else:
                    raise e  # Reraise exception if it's something else
            
            # Convert tensor to numpy array
            audio_numpy = audio_tensor.cpu().numpy()  # move to CPU first in case the tensor is on GPU

            # If the audio tensor has a batch dimension, select the first item (e.g., index 0)
            audio_numpy = audio_numpy[0]  # Use the first item in the batch
            
            # Save the generated chunk to the .wav file
            if i == 0:
                # Write the first chunk to file (overwrites the file)
                sf.write(output_path, audio_numpy, sample_rate)
            else:
                # Append the generated chunk to the file (write mode with append)
                with open(output_path, 'ab') as f:
                    sf.write(f, audio_numpy, sample_rate)
            
            # Clear memory after saving the chunk (useful for GPU)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                
            # Manually call garbage collection
            gc.collect()

        print(f"Chunk {i+1} saved to {output_path}")
    
    print("All audio chunks generated and saved successfully!")

# Generate a 200-second audio sample using a text prompt
generate_audio_in_chunks(model, "A relaxing ambient soundtrack", total_duration_sec=200, chunk_duration_sec=10)
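
For comparison, the usage pattern in Audiocraft’s own README is quite a bit simpler: the clip length comes from set_generation_params(duration=...) rather than a chunking loop, generate() takes a list of descriptions, and audio_write() handles saving. A minimal sketch along those lines (assuming a recent audiocraft version that accepts the full 'facebook/musicgen-small' id; untested on this hardware):

from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

# Load the small checkpoint; older audiocraft versions used the short name 'small'.
model = MusicGen.get_pretrained('facebook/musicgen-small')
model.set_generation_params(duration=10)  # clip length in seconds

# generate() expects a list of text descriptions and returns a batch of waveforms.
wav = model.generate(['A relaxing ambient soundtrack'])

# audio_write adds the .wav suffix and loudness-normalizes the output.
audio_write('ambient_clip', wav[0].cpu(), model.sample_rate, strategy="loudness")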

This worked for me. Loading with FP16 saves RAM, but it may not work well with the CPU.
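
To make the fp16 remark concrete: half precision only requires passing torch_dtype=torch.float16 to from_pretrained, which roughly halves the RAM needed for the weights, but float16 ops have limited CPU support. A minimal sketch of that variant (an assumption, not used in the float32 code below):

import torch
from transformers import MusicgenMelodyForConditionalGeneration

# fp16 load; may be slow or unsupported for some ops on CPU, so treat it as an experiment.
model_fp16 = MusicgenMelodyForConditionalGeneration.from_pretrained(
    "facebook/musicgen-small", torch_dtype=torch.float16
)

The full float32 version that worked follows: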

import os
import psutil
import torch
import gc
from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration, MusicgenMelodyConfig
import scipy.io.wavfile  # needed for scipy.io.wavfile.write below
# https://huggingface.co/docs/transformers/main/model_doc/musicgen_melody

# Function to log memory usage
def log_memory(stage=""):
    process = psutil.Process(os.getpid())
    print(f"Memory Usage after {stage}: {process.memory_info().rss / 1024 ** 2} MB")

log_memory("initial load")

# Hugging Face token for authentication
token = "TOKEN"

# Load model configuration and manually add missing config attributes
model_name = "facebook/musicgen-small"  # Use smaller variants if available
#model_name = "facebook/musicgen-melody" # For better output
config = MusicgenMelodyConfig.from_pretrained(model_name, token=token)

# Manually add the missing 'use_cache' attribute
config.use_cache = False  # This should resolve the AttributeError you encountered

# Manually add the missing initializer_factor if it's required
config.initializer_factor = 1.0  # Default value for initialization

# Modify configuration parameters for debugging
config.dropout = 0.1
config.layerdrop = 0.1
config.max_position_embeddings = 512  # Reduced
config.hidden_size = 128  # Smaller hidden size
config.num_codebooks = 128  # Adjusted to a smaller number for compatibility
config.scale_embedding = True
config.vocab_size = 50257
config.num_hidden_layers = 2  # Fewer layers
config.num_attention_heads = 4  # Fewer attention heads
config.attention_dropout = 0.1
config.activation_function = "gelu"
config.activation_dropout = 0.1
config.ffn_dim = 1024

log_memory("after config")

# Load the model
model = MusicgenMelodyForConditionalGeneration.from_pretrained(model_name, config=config, token=token).eval()

log_memory("after model loaded")

# Processor for the model
processor = AutoProcessor.from_pretrained(model_name)

# Text prompt for generation (the processor handles padding)
prompt = "A relaxing jazz track with piano and bass."

input_ids = processor(
    text=[prompt],
    padding=True,
    return_tensors="pt",
).to(model.device)

# Check the shape of the tokenized input
print(f"Input tensor shape after reshaping: {input_ids['input_ids'].shape}")

# Generate audio based on input prompt with no_grad to save memory
with torch.no_grad():
    generated_audio = model.generate(**input_ids,  max_new_tokens=256)
    print(generated_audio)

log_memory("after generation")

# Check type of the audio data
print(f"Type of generated audio: {type(generated_audio)}")

# Save the generated audio to a file
if isinstance(generated_audio, torch.Tensor):
    sampling_rate = model.config.audio_encoder.sampling_rate
    scipy.io.wavfile.write("generated_music.wav", rate=sampling_rate, data=generated_audio.to("cpu")[0, 0].numpy())
else:
    print("Unexpected audio format, unable to save.")

# Cleanup
del generated_audio  # Explicitly delete the variable
gc.collect()  # Garbage collection
log_memory("after cleanup")

This generates a crippled-sounding 5-second file. It’s something, but it sounds like something underwater and high-pitched, not music.


Yes. “facebook/musicgen-melody” works, but “small” is weird.
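
A likely reason “small” sounds so strange in the code above: facebook/musicgen-small is a plain MusicGen checkpoint, not a Melody one, and the manual config overrides (hidden_size, num_codebooks, vocab_size, etc.) mean large parts of the model are randomly reinitialized instead of using the pretrained weights. The model card shows usage through the plain Musicgen classes with the stock config; a minimal sketch based on that documentation (untested on this particular 16 GB machine, but the small checkpoint should fit comfortably):

import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Load the unmodified checkpoint with the plain (non-Melody) MusicGen classes.
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").eval()

inputs = processor(
    text=["A relaxing jazz track with piano and bass."],
    padding=True,
    return_tensors="pt",
)

# 256 new tokens is roughly 5 seconds of audio at MusicGen's 50 Hz frame rate.
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())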