Hey,
I’m integrating NEFTune with the Hugging Face Trainer for a project that uses the transformers library. I’ve set neftune_noise_alpha in TrainingArguments, but NEFTune does not appear to add any noise to the embeddings during training.
Here’s a brief overview of what I’ve done:
- Using AutoModelForCausalLM and AutoTokenizer to load the model and tokenizer.
- Configured the model with LoRA using peft.
- Set up TrainingArguments with neftune_noise_alpha.
- Initialized the Trainer with the model and dataset.
- Verified NEFTune hook registration, but hooks are not being registered automatically.
Despite this setup, two consecutive forward passes through the embedding layer in training mode return identical outputs, which indicates that no noise is being added, and the noise-magnitude check fails as a result.
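For reference, this is the noise I expect NEFTune to add, based on the NEFTune paper (my own sketch, not the transformers implementation): uniform noise in [-mag_norm, mag_norm] with mag_norm = alpha / sqrt(seq_len * hidden_dim), applied only in training mode.

import torch

def expected_neftune_noise(embeds: torch.Tensor, noise_alpha: float) -> torch.Tensor:
    # embeds: (batch, seq_len, hidden_dim) output of the embedding layer
    seq_len, hidden_dim = embeds.shape[1], embeds.shape[2]
    mag_norm = noise_alpha / (seq_len * hidden_dim) ** 0.5
    # uniform noise in [-mag_norm, mag_norm], added only while training
    return embeds + torch.zeros_like(embeds).uniform_(-mag_norm, mag_norm)

The magnitude check at the end of my script is based on this formula.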
I’m using transformers version 4.48.2. The full reproduction script and its output are below. Any insights or suggestions on how to resolve this would be greatly appreciated!
Thank you!
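Before the full script, here is a minimal isolated check I plan to run to confirm the hook itself adds noise when attached to a plain nn.Embedding, outside the Trainer entirely (a sketch; it assumes the hook reads neftune_noise_alpha off the module and only fires in training mode):

import torch
import torch.nn as nn
from transformers.trainer import neftune_post_forward_hook

emb = nn.Embedding(100, 64)
emb.neftune_noise_alpha = 0.4  # the hook reads this attribute from the module
handle = emb.register_forward_hook(neftune_post_forward_hook)

ids = torch.randint(0, 100, (1, 10))
emb.train()
print("noise in train mode:", not torch.allclose(emb(ids), emb(ids)))
emb.eval()
print("no noise in eval mode:", torch.allclose(emb(ids), emb(ids)))
handle.remove()

And here is the full script I’m running with the Trainer: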
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
import torch
from datasets import Dataset
import os
from transformers.trainer import neftune_post_forward_hook
# Disable all attention-related features
os.environ["XFORMERS_DISABLED"] = "1"
os.environ["FORCE_FLASH_ATTN_OFF"] = "1"
os.environ["DISABLE_FLASH_ATTENTION"] = "1"
def verify_neftune_hook(model):
    """Verify if NEFTune hook is properly registered"""
    if not hasattr(model, "get_input_embeddings"):
        print("Model does not have get_input_embeddings method")
        return False
    embeddings = model.get_input_embeddings()
    if not hasattr(embeddings, "_forward_hooks"):
        print("Embedding layer does not have forward hooks")
        return False
    if len(embeddings._forward_hooks) == 0:
        print("No forward hooks registered on embedding layer")
        return False
    print("NEFTune hook verification passed!")
    return True
# Load model using standard HF implementation
print("\nLoading model and tokenizer...")
model = AutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    trust_remote_code=True,
    use_cache=False,
    torch_dtype=torch.float32,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    trust_remote_code=True,
)
# Configure model
if hasattr(model, "config"):
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    model.config.use_memory_efficient_attention = False
    model.config.use_flash_attention_2 = False
    model.config._attn_implementation = "eager"
print("\nApplying LoRA configuration...")
# Add LoRA configuration
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
print("\nPreparing dataset...")
# Create a small dummy dataset and tokenize it
texts = ["Hello, world!"] * 4
tokenized_data = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=1024,
    return_tensors="pt",
)
# Create dataset with the expected format
dataset = Dataset.from_dict({
    "input_ids": tokenized_data["input_ids"],
    "attention_mask": tokenized_data["attention_mask"],
    "labels": tokenized_data["input_ids"].clone(),
})
print("\nSetting up training arguments with NEFTune...")
# Ensure NEFTune is activated through TrainingArguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    neftune_noise_alpha=0.4,  # Enable NEFTune
    gradient_checkpointing=False,
    optim="adamw_torch",
    remove_unused_columns=False,
    logging_steps=1,  # Log every step
)
# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
print("\nTesting NEFTune through Hugging Face Trainer...")
# Ensure the neftune_noise_alpha attribute is set on the embedding layer
embeddings = model.get_input_embeddings()
if not hasattr(embeddings, "neftune_noise_alpha"):
    embeddings.neftune_noise_alpha = training_args.neftune_noise_alpha
# Verify NEFTune hook registration
print("\nVerifying NEFTune hook registration...")
verify_neftune_hook(model) # Just verify, no manual registration
print("\nStarting training...")
# Run training
trainer.train()
print("\nPerforming manual verification of NEFTune...")
# Manual verification of NEFTune
model.train()
embeddings = model.get_input_embeddings()
embeddings.train()
# Test during training mode
print("\nTesting NEFTune in training mode...")
with torch.no_grad():
    sample_input = torch.ones(1, 10, dtype=torch.long, device=model.device)
    emb1 = embeddings(sample_input)
    emb2 = embeddings(sample_input)
training_diff = not torch.allclose(emb1, emb2)
print("Training mode -> outputs different:", training_diff)
# Test during eval mode
print("\nTesting NEFTune in eval mode...")
model.eval()
embeddings.eval()
with torch.no_grad():
    emb_eval1 = embeddings(sample_input)
    emb_eval2 = embeddings(sample_input)
eval_same = torch.allclose(emb_eval1, emb_eval2)
print("Eval mode -> outputs same:", eval_same)
# Initialize magnitude_check to False by default
magnitude_check = False
# Check noise magnitude
if not torch.allclose(emb1, emb2):
    print("\nChecking noise magnitude...")
    noise_magnitude = (emb1 - emb2).abs().max().item()
    # NEFTune scales uniform noise by alpha / sqrt(seq_len * hidden_dim); the difference
    # of two independently noised outputs can therefore be up to twice that bound
    mag_norm = 0.4 / ((sample_input.shape[1] * embeddings.weight.shape[1]) ** 0.5)  # using our neftune_noise_alpha
    expected_magnitude = 2 * mag_norm
    print(f"Noise magnitude check: actual={noise_magnitude:.2e}, expected<={expected_magnitude:.2e}")
    magnitude_check = 0.5 * expected_magnitude <= noise_magnitude <= expected_magnitude
    print("Magnitude within expected range:", magnitude_check)
# Final verification
print("\nFinal NEFTune Verification:")
all_checks_passed = training_diff and eval_same and magnitude_check
print("All checks passed:", all_checks_passed)
if not all_checks_passed:
    print("Failed checks:")
    if not training_diff:
        print("- Training outputs are not different (no noise being added)")
    if not eval_same:
        print("- Eval outputs are not the same (noise being added in eval mode)")
    if not magnitude_check:
        print("- Noise magnitude is outside expected range")
Output:
Loading model and tokenizer...
Applying LoRA configuration...
Preparing dataset...
Setting up training arguments with NEFTune...
Testing NEFTune through Hugging Face Trainer...
Verifying NEFTune hook registration...
No forward hooks registered on embedding layer
Starting training...
{'loss': 2.9122, 'grad_norm': 18.854711532592773, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.25}
{'loss': 2.65, 'grad_norm': 19.004514694213867, 'learning_rate': 2.5e-05, 'epoch': 0.5}
{'loss': 2.4033, 'grad_norm': 11.855709075927734, 'learning_rate': 1.25e-05, 'epoch': 0.75}
{'loss': 2.1365, 'grad_norm': 13.357922554016113, 'learning_rate': 0.0, 'epoch': 1.0}
{'train_runtime': 2.0219, 'train_samples_per_second': 1.978, 'train_steps_per_second': 1.978, 'train_loss': 2.525495231151581, 'epoch': 1.0}
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:02<00:00, 1.98it/s]
Performing manual verification of NEFTune...
Testing NEFTune in training mode...
Training mode -> outputs different: False
Testing NEFTune in eval mode...
Eval mode -> outputs same: True
Final NEFTune Verification:
All checks passed: False
Failed checks:
- Training outputs are not different (no noise being added)
- Noise magnitude is outside expected range
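One workaround I’m considering is registering the hook manually on the embedding layer before calling trainer.train() (just a sketch; I haven’t confirmed this is the intended usage, and the Trainer may add or remove the hook itself inside train()):

from transformers.trainer import neftune_post_forward_hook

embeddings = model.get_input_embeddings()
embeddings.neftune_noise_alpha = training_args.neftune_noise_alpha  # reuse the value from TrainingArguments
hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
trainer.train()
hook_handle.remove()  # clean up once training is done

Would that be a reasonable workaround, or is there a supported way to get the Trainer to register the hook automatically on a PEFT-wrapped model?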