Fine-tuned LLaMA-7B model is 5x slower than the raw Hugging Face model

  1. preprocess data
def tokenize(example):
    """Tokenize one example and attach labels for language-model training.

    Args:
        example: Mapping with "convo" and "response" entries; both are
            passed to the tokenizer as a pair.

    Returns:
        The tokenizer output, padded/truncated to 768 tokens, with an extra
        "labels" entry that mirrors "input_ids".
    """
    # Keep the encoding: the original discarded the tokenizer's return value,
    # so `result` below was never defined and the call raised NameError.
    result = tokenizer(
        example["convo"],
        example["response"],
        padding="max_length",
        truncation=True,
        max_length=768,
    )
    # The labels are the input ids themselves.
    # NOTE(review): padded positions are included in the labels as well;
    # consider masking them with -100 so they are ignored by the loss — confirm.
    result["labels"] = result["input_ids"]
    return result
# Load the dataset saved on disk and pick out its two splits.
dataset = load_from_disk(dataset_name)
train_dataset, test_dataset = dataset["train"], dataset["test"]

# Shuffle each split and keep a small subset: 500 training rows, 50 test rows.
train_dataset = train_dataset.shuffle().select(range(500))
test_dataset = test_dataset.shuffle().select(range(50))

# Tokenize both splits, using 8 worker processes each.
train_dataset = train_dataset.map(tokenize, num_proc=8)
test_dataset = test_dataset.map(tokenize, num_proc=8)

# Expose only the model input columns, in torch format.
for split in (train_dataset, test_dataset):
    split.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
  2. train model
# Load the base model and its tokenizer; the KV cache is switched off while
# training.
model = AutoModelForCausalLM.from_pretrained(args.model_name, use_cache=False)
tokenizer = LlamaTokenizer.from_pretrained(args.model_name, truncation=True)

# Padding needs a pad token: add one (and grow the embedding matrix to match)
# only when the tokenizer does not already define it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# Everything below runs unconditionally. In the original it was indented under
# the `if` above, so `trainer` was undefined whenever a pad token already
# existed and the final save_model call failed.
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    learning_rate=0.0001,
    save_steps=10,
    save_strategy='steps',
    save_total_limit=4,
    gradient_accumulation_steps=4,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# train model
trainer.train()

# NOTE(review): `use_cache=False` passed to from_pretrained is kept on
# model.config and is expected to be written into the saved config.json, in
# which case the fine-tuned checkpoint would generate without a KV cache and
# decode far more slowly than the base model. Re-enable it before saving.
model.config.use_cache = True
trainer.save_model('./model')
  3. inference
# Checkpoint to benchmark; swap in the fine-tuned model path to compare.
model_id = "decapoda-research/llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Load the weights in half precision.
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
predictor = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)
text = 'I am a llama ....'  # total about 750 tokens
# Call the pipeline built above (the original called an undefined `ds_clf`).
# `use_cache=True` asks generation to use the KV cache explicitly, so a
# checkpoint whose saved config has use_cache=False is not decoded without it.
predictor(text, max_new_tokens=128, use_cache=True)

When model_id is "decapoda-research/llama-7b-hf", generating 128 new tokens takes about 6s, but when I replace model_id with our own fine-tuned model, inference takes about 26s.

Remarks:
training: we are using SageMaker's smp distributed training method - p4d
inference: local test - p3.2xlarge
transformers install: pip install git+https://github.com/huggingface/transformers.git

I really need help! I've tried every possible solution I could find, but nothing improves the inference speed.