- preprocess data
def tokenize(example):
    # tokenize the conversation/response pair, padding/truncating to 768 tokens
    result = tokenizer(example["convo"], example["response"], padding='max_length', truncation=True, max_length=768)
    # causal LM: labels are just a copy of input_ids
    result["labels"] = result["input_ids"]
    return result
# load dataset
dataset = load_from_disk(dataset_name)
train_dataset = dataset["train"]
test_dataset = dataset["test"]
# train_dataset, test_dataset = load_from_disk(dataset_name)
train_dataset = train_dataset.shuffle().select(range(500)) # shrink the train split to 500 examples
test_dataset = test_dataset.shuffle().select(range(50)) # shrink the test split to 50 examples
# tokenize dataset
train_dataset = train_dataset.map(tokenize, num_proc=8)
test_dataset = test_dataset.map(tokenize, num_proc=8)
# set format for pytorch
# train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
# test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
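As a quick sanity check of the preprocessing (just a sketch, using the columns set above), one example can be inspected to confirm the 768-token padding and that labels mirror input_ids:

# sanity check on one tokenized example (sketch)
sample = train_dataset[0]
print(sample["input_ids"].shape)        # expect torch.Size([768])
print(sample["attention_mask"].shape)   # expect torch.Size([768])
print((sample["labels"] == sample["input_ids"]).all())  # labels copied from input_ids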
- train model
model = AutoModelForCausalLM.from_pretrained(args.model_name, use_cache=False)
tokenizer = LlamaTokenizer.from_pretrained(args.model_name, truncation=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    learning_rate=0.0001,
    save_steps=10,
    save_strategy='steps',
    save_total_limit=4,
    gradient_accumulation_steps=4,
    fp16=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)
# train model
trainer.train()
trainer.save_model('./model')
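For completeness, this is how I can inspect the generation-related settings recorded in the saved checkpoint (a sketch, assuming the ./model directory written by trainer.save_model above):

# inspect the saved fine-tuned checkpoint's config (sketch)
from transformers import AutoConfig

config = AutoConfig.from_pretrained('./model')
print(config.use_cache)    # the model was loaded with use_cache=False for training
print(config.torch_dtype)  # dtype recorded in the saved config, if any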
- inference
model_id = "decapoda-research/llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
predictor = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
text = "I am a llama ...."  # prompt is about 750 tokens in total
predictor(text, max_new_tokens=128)
When model_id is "decapoda-research/llama-7b-hf", generating 128 new tokens takes about 6 s, but when I replace model_id with our own fine-tuned model, the same generation takes about 26 s.
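To make the comparison concrete, this is a minimal timing sketch of the two runs (it assumes the fine-tuned checkpoint was saved to ./model as above, that both models are loaded the same way, and that text is the ~750-token prompt from the snippet):

import time
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline

def time_generation(model_path, prompt):
    # load tokenizer and model identically for both checkpoints
    tok = LlamaTokenizer.from_pretrained(model_path)
    mdl = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16)
    gen = pipeline("text-generation", model=mdl, tokenizer=tok, device=0)
    start = time.time()
    gen(prompt, max_new_tokens=128)
    return time.time() - start

print(time_generation("decapoda-research/llama-7b-hf", text))  # ~6 s in my test
print(time_generation("./model", text))                        # ~26 s in my test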
Remarks:
training: we are using SageMaker's SMP distributed training on a p4d instance
inference: local test on a p3.2xlarge instance
transformers install: pip install git+https://github.com/huggingface/transformers.git
I really need help! I've tried every possible solution I could find, but nothing improves the inference speed.