BERT embedding on GPU

# Single source of truth for the checkpoint id: the tokenizer and the model
# must be loaded from the same checkpoint so vocabulary and weights match.
model_name = "bert-base-uncased"

# NOTE(review): `AutoTokenizer`, `BertModel` and `access_token` are not
# defined in this snippet; they are assumed to be imported/defined earlier.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)

# `.cuda()` moves the weights to the GPU and raises if no CUDA device exists.
model = BertModel.from_pretrained(model_name, token=access_token).cuda()
import torch
from tqdm import tqdm

def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token vectors over the sequence axis, skipping padding."""
    if attention_mask is None:
        # No mask available: fall back to a plain mean over all positions.
        return last_hidden_state.mean(dim=1)
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    # Clamp so a row with no real tokens cannot divide by zero.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return summed / token_counts


def embed_texts(texts, tokenizer, model, batch_size=256, output_file='embeddings.pt'):
    """Embed texts with a transformer encoder and save the result to disk.

    Each text is tokenized (truncated to 512 tokens), run through ``model``
    and reduced to one vector by mean pooling over its non-padding tokens.

    Args:
        texts: Sliceable sequence of strings (for example a list).
        tokenizer: Callable tokenizer returning a batch with a ``.to(device)``
            method and, normally, an ``attention_mask`` entry.
        model: Encoder whose output exposes ``last_hidden_state``. It is moved
            to the selected device and switched to eval mode as a side effect.
        batch_size: Number of texts per forward pass.
        output_file: Path the embeddings tensor is written to via ``torch.save``.

    Returns:
        A CPU tensor with one row per input text. For empty ``texts`` an
        empty tensor with zero rows is returned (and saved).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_texts = len(texts)
    embeddings = []

    # Move model to the selected device; eval mode disables dropout so the
    # embeddings are deterministic.
    model.to(device)
    model.eval()

    # The context manager closes the progress bar even if a batch fails.
    with tqdm(total=num_texts, desc="Embedding texts", unit="texts") as pbar:
        for i in range(0, num_texts, batch_size):
            batch_texts = texts[i:i + batch_size]
            # Tokenize batch of texts (padded to the longest text in the batch)
            inputs = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            # Process batch with the model; no_grad avoids storing activations
            with torch.no_grad():
                outputs = model(**inputs)
            # Mean pooling that ignores padding, so a text's embedding does
            # not depend on which other texts share its batch.
            attention_mask = inputs["attention_mask"] if "attention_mask" in inputs else None
            batch_embeddings = _masked_mean_pool(outputs.last_hidden_state, attention_mask)
            # Keep results on the CPU so GPU memory does not grow with the
            # number of texts and the saved file loads without CUDA.
            embeddings.append(batch_embeddings.cpu())

            pbar.update(len(batch_texts))

    if embeddings:
        # Concatenate embeddings of all batches
        embeddings = torch.cat(embeddings, dim=0)
    else:
        # torch.cat rejects an empty list; return a well-formed empty result.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        embeddings = torch.empty((0, hidden_size))

    # Save embeddings directly as a tensor
    torch.save(embeddings, output_file)

    print(f"Embeddings saved to {output_file}")

    return embeddings

# Usage example
# Assuming `texts`, `tokenizer`, and `model` are already defined and initialized
embeddings = embed_texts(df['Post'][:100000].tolist(), tokenizer, model, batch_size=256, output_file='/content/drive/My

Hi, I am running this code on Colab with a T4 GPU. I can see that 7.5 GB of GPU memory is constantly in use, but the speed is only 34.6 texts/sec. I am a beginner, so I am not sure whether this is a normal speed for a GPU or whether it is still using only the CPU. Ideally, I think GPU usage should return to normal after execution, but after the script completed it remained at 7.5 GB. What am I doing wrong? Thanks!