I am trying to create embeddings from a large amount of text. I am doing it paragraph by paragraph in a loop, and I always run out of GPU memory. Any idea why this is happening?

My code looks like this:
```python
import gc
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xxl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xxl", device_map="auto", torch_dtype=torch.float16)

def getSentenceEmbedding(sentenceText, languageModel, modelTokenizer):
    sentence_tokens = modelTokenizer(sentenceText, return_tensors="pt")
    sentence_input_ids = sentence_tokens.input_ids  # .to('cuda')
    # run only the encoder to get token-level hidden states
    encodings = languageModel.encoder(input_ids=sentence_input_ids, attention_mask=sentence_tokens.attention_mask, return_dict=True)
    del sentence_input_ids
    del sentence_tokens
    gc.collect()
    torch.cuda.empty_cache()
    # mean-pool the token states into a single sentence vector
    return torch.mean(encodings.last_hidden_state, dim=1)

embeddings = []
for paragraph in text:  # text is my list of paragraph strings
    torch.cuda.empty_cache()
    embeddings.append(getSentenceEmbedding(paragraph, model, tokenizer))
```
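One thing I was wondering: since I never wrap the forward pass in `torch.no_grad()`, each tensor I append presumably keeps its autograd graph alive and stays on the GPU, so the list grows in GPU memory every iteration. Below is a sketch of the variant I had in mind, with the encoder call under `torch.no_grad()` and the pooled vector copied to the CPU before storing; the name `get_sentence_embedding_nograd` is just something I made up for the sketch, and `text` is again my list of paragraphs. Would this be the right fix?

```python
import torch

def get_sentence_embedding_nograd(sentence_text, language_model, model_tokenizer):
    # move the tokenized inputs to the device the model was dispatched to
    tokens = model_tokenizer(sentence_text, return_tensors="pt").to(language_model.device)
    # no_grad so the forward pass builds no autograd graph
    with torch.no_grad():
        encodings = language_model.encoder(
            input_ids=tokens.input_ids,
            attention_mask=tokens.attention_mask,
            return_dict=True,
        )
    # mean-pool, then copy the vector to CPU so no GPU memory is retained
    return encodings.last_hidden_state.mean(dim=1).cpu()

embeddings = [get_sentence_embedding_nograd(p, model, tokenizer) for p in text]
```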