The training run was interrupted. When I restarted it, there was no output in the terminal. GPU stats constantly show 3 MB and 420 MB of memory usage on the two A100s. Is the training stuck?
import torch
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTTrainer
from datasets import load_dataset
from accelerate import PartialState
from utils.util import *  # project-specific helpers (presumably where Load_Model comes from)

# Allow TF32 matmuls/convolutions on Ampere GPUs for faster training
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

max_seq_length = 2048
dtype = None          # let the loader pick the dtype
load_in_4bit = True   # load the base model 4-bit quantized

# Per-process device index so each DDP process gets its own GPU
device_string = PartialState().process_index
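# Optional, my own addition while diagnosing the silent restart: raise the library log verbosity
# so checkpoint resuming and dataset preprocessing print progress instead of running quietly.
import transformers
import datasets
transformers.utils.logging.set_verbosity_info()
datasets.utils.logging.set_verbosity_info()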
args = TrainingArguments(
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    save_steps = 100,
    learning_rate = 2e-4,
    fp16 = False,
    bf16 = True,
    tf32 = True,
    logging_steps = 2,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    # Note: this field is not read by Trainer itself; resuming is actually controlled by the
    # resume_from_checkpoint argument passed to trainer.train() at the bottom of the script.
    resume_from_checkpoint = 'outputs/checkpoint-7900',
    do_eval = True,
    eval_strategy = "steps",
    eval_steps = 1000,
    save_total_limit = 5,
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs = {'use_reentrant': False},
    logging_dir = 'logs',
    log_level = 'info',
)
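# Sanity check, my own addition: confirm which checkpoint the Trainer will actually resume from
# when resume_from_checkpoint=True is passed to trainer.train() (it picks the latest one in output_dir).
from transformers.trainer_utils import get_last_checkpoint
print('latest checkpoint in outputs/:', get_last_checkpoint("outputs"))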
tokenizer = AutoTokenizer.from_pretrained("tokenizer2")
tokenizer.padding_side = 'right'

model = Load_Model.from_pretrained(
    "training1/model1",
    max_seq_length = max_seq_length,
    device_map = {'': device_string},   # place this process's model on its own GPU
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
# Resize the embedding matrix to match the (extended) tokenizer vocabulary
model.resize_token_embeddings(len(tokenizer))
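# Sanity check, my own addition (assuming Load_Model returns a standard transformers model):
# the input embedding size should now match the tokenizer, which matters when resuming a
# checkpoint that was saved with the resized vocabulary.
print('embedding rows vs. tokenizer size:', model.get_input_embeddings().weight.shape[0], len(tokenizer))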
# model.save_pretrained('base_model2')
from peft import LoraConfig
peft_config = LoraConfig(
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens"],
    lora_alpha = 16,
    lora_dropout = 0.05,
    bias = "none",
    use_rslora = True,
    task_type = "CAUSAL_LM",
    modules_to_save = ["embed_tokens"],
)
# from peft import get_peft_model, prepare_model_for_kbit_training
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

# Counted before SFTTrainer applies the LoRA adapters, so this reports the base model's
# trainable parameters, not the adapter's.
print('trainable parameters:', sum(p.numel() for p in model.parameters() if p.requires_grad))
dataset = load_dataset('csv', data_files='llm_prompts2.csv', split='train')
dataset = dataset.train_test_split(test_size=0.01, seed=47, shuffle=True)
print("Dataset loaded. Starting training.")
# Train
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 16,
    packing = False,   # packing=True can make training up to ~5x faster for short sequences
    args = args,
    peft_config = peft_config,
)

# resume_from_checkpoint=True tells the Trainer to load the latest checkpoint found in output_dir
# (here outputs/checkpoint-7900) and continue from that optimizer/scheduler/data state.
trainer_stats = trainer.train(
    resume_from_checkpoint = True
)
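# A possibility I am looking into (my own note, not something I have confirmed for this run):
# when resuming, by default the Trainer skips/replays all already-consumed batches to restore the
# exact data position, which can run for a long time with no new log lines and very low GPU usage.
# Setting ignore_data_skip=True in TrainingArguments skips that replay (at the cost of not
# restoring the exact data order), e.g.:
#
#   args = TrainingArguments(..., ignore_data_skip=True)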