# Single source of truth for the checkpoint: both the tokenizer and the
# encoder are loaded from this name so they can never drift apart.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
# Load the weights from `model_name` (previously a repeated string literal)
# and place the model on the GPU; `.cuda()` requires a CUDA device.
model = BertModel.from_pretrained(model_name, token=access_token).cuda()
import torch
from tqdm import tqdm
def embed_texts(texts, tokenizer, model, batch_size=256, output_file='embeddings.pt'):
    """Embed ``texts`` in batches and save the result as a single tensor.

    Each batch is tokenized (padded, truncated to 512 tokens), run through
    ``model`` without gradient tracking, and reduced to one vector per text
    by averaging ``last_hidden_state`` over dimension 1. When the tokenizer
    output contains an ``attention_mask``, the average is weighted by it so
    that padding positions do not contribute; otherwise a plain mean is used.

    Args:
        texts: Sequence of strings; must support ``len()`` and slicing.
        tokenizer: Callable accepting a list of strings plus the keyword
            arguments ``return_tensors``, ``padding``, ``truncation`` and
            ``max_length``; its result must offer ``.to(device)`` and be
            unpackable as keyword arguments for ``model``.
        model: Module offering ``.to(device)`` and ``.eval()`` whose output
            exposes ``last_hidden_state`` (assumed to be laid out as
            batch x tokens x hidden).
        batch_size: Number of texts processed per forward pass.
        output_file: Path the final tensor is written to with ``torch.save``.

    Returns:
        A CPU tensor with one row per input text. For empty ``texts`` an
        empty tensor with zero rows is returned (and still saved).

    Side effects:
        ``model`` is moved to CUDA when available (else CPU), switched to
        eval mode, and left there; ``output_file`` is (over)written.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_texts = len(texts)
    embeddings = []

    # Move model to the selected device and disable dropout so that the
    # embeddings are deterministic.
    model.to(device)
    model.eval()

    # The context manager closes the progress bar even if a batch fails.
    with tqdm(total=num_texts, desc="Embedding texts", unit="texts") as pbar:
        for i in range(0, num_texts, batch_size):
            batch_texts = texts[i:i + batch_size]

            # Tokenize batch of texts
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512,
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                hidden = outputs.last_hidden_state

                if "attention_mask" in inputs:
                    # Masked mean pooling: padded positions get weight 0, so
                    # a text's embedding does not depend on how long the
                    # other texts in its batch happen to be.
                    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
                    token_counts = mask.sum(dim=1).clamp(min=1)
                    batch_embeddings = (hidden * mask).sum(dim=1) / token_counts
                else:
                    batch_embeddings = hidden.mean(dim=1)

            # Keep results on the CPU so they do not pile up in GPU memory
            # and so the saved file can be loaded without a GPU.
            embeddings.append(batch_embeddings.cpu())

            pbar.update(len(batch_texts))

    if embeddings:
        # Concatenate embeddings of all batches
        embeddings = torch.cat(embeddings, dim=0)
    else:
        # torch.cat rejects an empty list; return a well-formed empty result.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        embeddings = torch.empty((0, hidden_size))

    # Save embeddings directly as a tensor
    torch.save(embeddings, output_file)
    print(f"Embeddings saved to {output_file}")
    return embeddings
# Usage example
# Assuming `texts`, `tokenizer`, and `model` are already defined and initialized
embeddings = embed_texts(df['Post'][:100000].tolist(), tokenizer, model, batch_size=256, output_file='/content/drive/My
Hi, I am running this code on Colab with a T4 GPU. I can see that 7.5 GB of GPU memory is constantly in use, but the speed is only 34.6 texts/sec. I am a beginner, so I am not sure whether this is a normal speed on a GPU or whether it is still using only the CPU. Ideally, I think GPU usage should return to normal after execution, but after the script completed it remained at 7.5 GB. What am I doing wrong? Thanks!