Hi everyone,
I want to fine-tune a specific model for sentence classification, BAAI/bge-multilingual-gemma2, and I have multiple questions regarding the usage of PEFT (after having read the documentation and several forum posts).
I want to apply PEFT to this base model to extract embeddings (using it as a feature extractor), wrap it in a PyTorch Module which will contain a classification head (among other things), and then start fine-tuning.
Here is my code:
[...]
#---------------------------------------------------------------------------------
def train_model(model, dataloader, valid_dataloader, optimizer, scheduler=None, num_epochs=5, device="cuda"):
    model = model.to(device)
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        model.train()
        for batch in tqdm(dataloader, total=len(dataloader), unit='row'):
            optimizer.zero_grad()
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device)
            )
            # One-hot labels
            labels = batch['label'].to(device)
            loss = nn.CrossEntropyLoss()(logits, labels)
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()
            total_loss += loss.item()
        metrics = evaluate_model(model, valid_dataloader, device=device)
        print(f"Training Epoch {epoch + 1}, Accumulated Train Loss: {total_loss / len(dataloader)}")
        print(f"Eval: Valid Loss: {metrics['loss']}, Valid Accuracy: {metrics['accuracy']}")
#-------------------------------------------------------------------
class PreferencePredictionModel(nn.Module):
    def __init__(self, gemma_model, num_classes=2):
        super(PreferencePredictionModel, self).__init__()
        # Load transformer model
        self.gemma_model = gemma_model
        transformer_hidden_size = gemma_model.config.hidden_size
        # Fully connected layers for features
        #self.feature_fc = nn.Linear(feature_dim, 64)
        # Final classification layer
        self.classifier = nn.Sequential(
            #nn.Linear(transformer_hidden_size + 64, 128),
            nn.Linear(transformer_hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, num_classes)
        )

    def forward(self, input_ids, attention_mask, features=None):
        outputs = self.gemma_model(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = last_token_pool(outputs.last_hidden_state, attention_mask)
        # normalize embeddings ????
        #embeddings = F.normalize(embeddings, p=2, dim=1)
        # Feature processing
        #feature_output = self.feature_fc(features)
        # Concatenate and classify
        combined = embeddings
        logits = self.classifier(combined)
        return logits
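# Note (assumption): last_token_pool is referenced above but defined in the elided part.
# For completeness, a sketch based on the pooling helper from the bge-multilingual-gemma2
# model card (take the hidden state of the last non-padded token of each sequence):
def last_token_pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        # left-padded batches: the last position is the last real token
        return last_hidden_states[:, -1]
    # right-padded batches: index each sequence at its own last real token
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]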
#---------------------------------------------------------------------------------
[...]
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj"],
    #layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.FEATURE_EXTRACTION,  # SEQ_CLS
)
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained(
    'BAAI/bge-multilingual-gemma2',
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config
)
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
lora_model = get_peft_model(model, lora_config)
predictionModel = PreferencePredictionModel(gemma_model=lora_model, num_classes=2)
optimizer = optim.Adam(predictionModel.parameters())
train_model(predictionModel, dataloader_train, dataloader_valid, optimizer, scheduler=None, device=device, num_epochs=config.n_epochs)
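# (Hedged aside, not part of my original script: a quick check that, with
#  prepare_model_for_kbit_training + LoRA, only the LoRA adapters and the wrapper's
#  classifier head end up trainable.)
lora_model.print_trainable_parameters()
trainable = [n for n, p in predictionModel.named_parameters() if p.requires_grad]
print(f"{len(trainable)} trainable tensors, e.g. {trainable[:3]}")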
This actually runs, but very slowly, so I want to be able to save/load the model and then load it again later for further training.
1st Question
Am I allowed to wrap a PEFT-tuned model inside a PyTorch Module? Do I have to use Hugging Face's Trainer class to train my model, or can I still use a classic custom batch-looping function?
2nd Question
I have seen multiple examples of how to save/load a PEFT-tuned model by saving and loading the base model and the adapters. But in this case I tried to use the PyTorch saving method:
torch.save({
    'epoch': 0,
    'model_state_dict': predictionModel.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, f'../CheckPoints/PreferencePredictionModel.pt')
and loading it by recreating an identical (mirror) model and then loading the checkpoint resulted in multiple errors:
Unexpected key(s) in state_dict: "gemma_model.base_model.model.layers.0.self_attn.q_proj.base_layer.weight.absmax" […]
Is there a way to properly save a wrapped PEFT model?
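In case it clarifies what I mean, the adapter-style saving I have seen in examples looks roughly like the sketch below; saving the classifier head of my wrapper separately is just my own guess, so I am not sure this is the right way to combine it with the wrapper:
# Rough sketch (not sure it is correct): save the LoRA adapter via PEFT and the custom
# classifier head separately, then rebuild the wrapper on load.
lora_model.save_pretrained('../CheckPoints/lora_adapter')  # adapter weights only
torch.save(predictionModel.classifier.state_dict(), '../CheckPoints/classifier_head.pt')

# Loading (sketch): recreate the quantized base model, attach the saved adapter,
# then rebuild the wrapper and load the classifier head.
from peft import PeftModel
base = AutoModel.from_pretrained(
    'BAAI/bge-multilingual-gemma2',
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config
)
lora_reloaded = PeftModel.from_pretrained(base, '../CheckPoints/lora_adapter', is_trainable=True)
reloaded = PreferencePredictionModel(gemma_model=lora_reloaded, num_classes=2)
reloaded.classifier.load_state_dict(torch.load('../CheckPoints/classifier_head.pt'))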
Thank you