Hello, so I am trying to use carptriever-1 to embed batches of documents (>800-1000) usually, and even trying to embed 50 documents takes up 10+ GB of memory with the code below. Any advice on how I can resolve this issue?
from transformers import AutoTokenizer, AutoModel
import torch
device = "cpu" #issue occurs with CPU or cuda
model = AutoModel.from_pretrained("CarperAI/carptriever-1", add_pooling_layer=False)
tokenizer = AutoTokenizer.from_pretrained("CarperAI/carptriever-1")
def mean_pooling(token_embeddings, mask):
token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
return sentence_embeddings
class CarptrieverInstance:
def __init__(self, query, documents):
self.query = query
self.documents = documents
self.sentences = [query] + documents
self.embeddings = self.embed()
self.scores = self.search(self.embeddings)
def embed(self):
chunks = [self.documents[i:i + 40] for i in range(0, len(self.documents), 40)]
total_embeddings = []
for chunk in chunks:
inputs = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
outputs = model(**inputs)
embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
total_embeddings.append(embeddings)
del inputs, outputs, embeddings
del inputs, outputs, embeddings # trying to clear some memory here, appears to have no effect
return torch.cat([total_embeddings[0]] + total_embeddings[1:], dim=0).cpu().tolist()
def search(self, embeddings):
query_embedding, sentence_embeddings = embeddings[0], embeddings[1:]
scores = (query_embedding @ sentence_embeddings.transpose(0, 1)).cuda().tolist()
sentence_score_pairs = sorted(zip(self.sentences[1:], scores), reverse=True)
sentence_score_pairs.sort(key=lambda x: x[1], reverse=True)
return sentence_score_pairs