@valhalla, I hope you are well. Sorry to bother you again. I simply ran the script with the command below and then checked nvidia-smi, and all the GPUs I specified (1, 2 and 3) are being used. Is that enough, or do I need to change anything else inside the code? This is my code; can I trust the final model? Also, how did you call your function?
CUDA_VISIBLE_DEVICES="1,2,3" python casesummary_resolution_GPT_Neo_GPU_V5-125M-Trainer_v22.py
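Just to double-check visibility from inside Python (this is only my own sanity check, not part of the training script), I ran something like:

import torch

# With CUDA_VISIBLE_DEVICES="1,2,3" this should report 3 devices,
# re-indexed inside the process as cuda:0, cuda:1 and cuda:2.
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))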
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM, IntervalStrategy
from sklearn.model_selection import train_test_split

torch.manual_seed(42)

pretrained_model = '/home//GPT-NEO-125M/'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')

model = AutoModelForCausalLM.from_pretrained(pretrained_model).cuda()
print(torch.cuda.current_device())
model.resize_token_embeddings(len(tokenizer))  # account for the added special tokens

descriptions = DataWhole_1  # list of training texts, prepared earlier in the script

# max_length = max([len(tokenizer.encode(description)) for description in descriptions])
max_length = 350
print("Max length: {}".format(max_length))
class NetflixDataset(Dataset):
    def __init__(self, txt_list, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>', truncation=True,
                                       max_length=max_length, padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
dataset = NetflixDataset(descriptions, tokenizer, max_length=max_length)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.1, random_state=42, shuffle=False)
training_args = TrainingArguments(output_dir=Results_Path, logging_dir=Results_Path,
                                  learning_rate=5e-5, num_train_epochs=16,
                                  per_device_train_batch_size=4, per_device_eval_batch_size=4,
                                  warmup_steps=100, weight_decay=0.01,
                                  evaluation_strategy="steps", eval_steps=10000,
                                  logging_strategy="steps", logging_steps=10000,
                                  save_strategy="steps", save_steps=10000, save_total_limit=2,
                                  load_best_model_at_end=True, seed=42,
                                  report_to="tensorboard")
trainer = Trainer(model=model, args=training_args, tokenizer=tokenizer,
                  train_dataset=train_dataset, eval_dataset=val_dataset,
                  data_collator=lambda data: {'input_ids': torch.stack([f[0] for f in data]),
                                              'attention_mask': torch.stack([f[1] for f in data]),
                                              'labels': torch.stack([f[0] for f in data])})
trainer.train()
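For the "can I trust the final model?" part, this is what I added after train() as a sanity check. I am assuming training_args.n_gpu reflects the number of GPUs the Trainer actually uses, and that with load_best_model_at_end=True the best checkpoint is already loaded into trainer.model:

# My assumption: Trainer wraps the model in torch.nn.DataParallel on its own
# when more than one GPU is visible, so n_gpu should print 3 here.
print("GPUs used by Trainer:", training_args.n_gpu)

# With load_best_model_at_end=True the best checkpoint should already be loaded,
# so saving here keeps the model I actually want to use.
trainer.save_model(Results_Path)
tokenizer.save_pretrained(Results_Path)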