Generation begins with the eos_token and then keeps producing one of the tokens I added, over and over, until the generation limit is reached.
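Concretely, a call along these lines reproduces it (a minimal sketch; the prompt text and decoding settings are placeholders, not my exact script):

inputs = tokenizer("###Prompt: ...", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))  # starts with eos_token, then repeats one added token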
I intended to fine-tune the model with QLoRA but ran into this problem, and, oddly enough, the plain quantized model (with no fine-tuning at all) produced the exact same behavior!
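For reference, the bnb_config used below was a standard 4-bit QLoRA setup roughly like this (a sketch; the exact values may differ from my run):

import torch
from transformers import BitsAndBytesConfig

# Assumed typical 4-bit NF4 QLoRA config; my actual settings may have differed.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
device_map = "auto"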
NOTE: before saving the model and tokenizer for the later LoRA fine-tuning, I modified them as follows.
Tokenizer:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("core42/jais-13b", trust_remote_code=True)
tokenizer.padding_side = 'right'
print(tokenizer.eos_token)

# Add a dedicated padding token
tokenizer.add_special_tokens({'pad_token': '<|pad|>'})
print(tokenizer.pad_token)

# 301 'hidden' and 301 'number' tokens: <hidden_0>..<hidden_300>, <number_0>..<number_300>
hidden_tokens = [f'<hidden_{i}>' for i in range(0, 301)]
number_tokens = [f'<number_{i}>' for i in range(0, 301)]
special_tokens_1 = hidden_tokens + number_tokens
tokenizer.add_tokens(special_tokens_1)

# Prompt/completion delimiter tokens
special_tokens_2 = ["###Prompt:", "###Completion:"]
tokenizer.add_tokens(special_tokens_2)
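As a quick sanity check (illustrative, not part of my saved script), every added token should map to a single new ID and survive tokenization unsplit:

print(len(tokenizer))                                 # new vocabulary size
print(tokenizer.convert_tokens_to_ids('<hidden_0>'))  # one ID past the original vocab
print(tokenizer.tokenize('###Prompt: <number_42>'))   # added tokens should stay whole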
Model:
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "Model_de_base/models--core42--jais-13b/snapshots/cce6dfc87639f6146fc15333bd326db236497879",
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Grow the embedding matrix to cover the newly added tokens
model.resize_token_embeddings(len(tokenizer))
embedding_layer = model.get_input_embeddings()
average_embedding = torch.mean(embedding_layer.weight.data, dim=0)

# Noise scales for the new embeddings
group_noise_scale = 0.05  # differentiates the 'hidden' and 'number' groups
token_noise_scale = 0.01  # differentiates tokens within a group

# Group-level noise, shared by every token of the same group
hidden_group_noise = torch.randn_like(average_embedding) * group_noise_scale
number_group_noise = torch.randn_like(average_embedding) * group_noise_scale

# Initialize 'hidden' token embeddings: mean embedding + group noise + per-token noise
for token in hidden_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    token_noise = torch.randn_like(average_embedding) * token_noise_scale
    embedding_layer.weight.data[token_id] = average_embedding + hidden_group_noise + token_noise

# Initialize 'number' token embeddings the same way, with their own group noise
for token in number_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    token_noise = torch.randn_like(average_embedding) * token_noise_scale
    embedding_layer.weight.data[token_id] = average_embedding + number_group_noise + token_noise

# Give the delimiter tokens fully random embeddings
for token in special_tokens_2:
    token_id = tokenizer.convert_tokens_to_ids(token)
    random_embedding = torch.randn_like(embedding_layer.weight[token_id])
    embedding_layer.weight.data[token_id] = random_embedding.clone().detach()

model.config.vocab_size = len(tokenizer)
# Replace the LM head with a freshly initialized (untrained) linear layer of the new size
model.lm_head = torch.nn.Linear(model.config.hidden_size, len(tokenizer))
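I then saved both artifacts for the LoRA run, roughly like this (the output directory is a placeholder; note that depending on the transformers/bitsandbytes versions, saving a 4-bit quantized model directly may not be supported):

# Illustrative save step; 'jais-13b-extended' is a placeholder path.
tokenizer.save_pretrained('jais-13b-extended')
model.save_pretrained('jais-13b-extended')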
I'd be extremely grateful for any help.