I’m doing inference (not training) over a loop of puzzle queries. Even on an A6000 with 48 GB of VRAM I run out of CUDA memory after about 10 loop iterations (I need to do 400). It seems that VRAM is not being released at the end of each iteration. I’ve tried model.eval(), with torch.no_grad():, gc.collect(), and torch.cuda.empty_cache() with no luck. I don’t think setting max_split_size_mb will help, since VRAM usage just increases with each iteration. Has anyone experienced this? Any ideas what to do?
OutOfMemoryError: CUDA out of memory. Tried to allocate 33.94 GiB (GPU 0; 47.54 GiB total capacity; 22.99 GiB already allocated; 23.78 GiB free; 23.37 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
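(For completeness: if I did want to try the allocator hint from the error message, I believe it is set via an environment variable before PyTorch initializes the CUDA allocator, roughly as below with a placeholder value. Since usage grows every iteration, I don't expect it to fix the underlying problem.)

import os
# Hypothetical allocator tweak from the error message; the 512 value is just an example.
# Must be set before the CUDA caching allocator is initialized.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"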
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

peft_model_id = gdrive_path + "../adapters/Mistral-7B-Instruct_finetuned_on_20000_core_puzzles"
config = PeftConfig.from_pretrained(peft_model_id)

# Base model, loaded in 4-bit via bitsandbytes (bnb_config is defined elsewhere)
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    return_dict=True,
    load_in_4bit=True,
    device_map={"": 0},
)

if download_base_tokenizer:
    base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
else:
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, peft_model_id)
import json
import os
import re
import ast
import gc

directory = "../data/ARC/evaluation/"
total_files = 0
num_correct = 0

for root, dirs, files in os.walk(directory):
    for name in files:
        torch.cuda.empty_cache()
        gc.collect()

        file_path = os.path.join(root, name)
        with open(file_path, "r") as f:
            data = json.load(f)
        train_tasks = data["train"]
        test_tasks = data["test"]

        # Collect the demonstration grids and the test grids for this puzzle
        train_input_grids = []
        train_output_grids = []
        test_input_grids = []
        test_output_grids = []
        for train_task in train_tasks:
            ti = train_task["input"]
            train_input_grids.append(ti)
            to = train_task["output"]
            train_output_grids.append(to)
        for test_task in test_tasks:
            ti = test_task["input"]
            test_input_grids.append(ti)
            to = test_task["output"]
            test_output_grids.append(to)

        # Build the prompt and run generation
        instruction = create_instruction(
            train_input_grids, train_output_grids, test_input_grids
        )
        query = '<s>[INST] ' + instruction + ' [/INST] '
        encodeds = tokenizer(query, return_tensors="pt", add_special_tokens=False)
        model_inputs = encodeds.to("cuda")

        model.eval()
        with torch.no_grad():
            generated_ids = model.generate(
                **model_inputs,
                max_new_tokens=4096,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )
        decoded = tokenizer.batch_decode(generated_ids)
        result = decoded[0]
        result = result[len(query):]

        # Parse the predicted grid out of the generated text
        match = re.search(r"Test_Output_1=\[\[.*?\]\]", result)
        if match:
            array_string = match.group(0).replace("Test_Output_1=", "")
            array = ast.literal_eval(array_string)
            print("Result")
            print(result)
            print("")
        else:
            print("No match found")
            print(result)
            print("")
            array = []

        ground_truth = test_output_grids[0]
        print("Ground Truth")
        for row in ground_truth:
            print(row)
        print("Prediction")
        for row in array:
            print(row)
        if array == ground_truth:
            print("...................................Correct!!!!!!!!!!!!!\n")
            num_correct += 1
        else:
            print("...................................Incorrect.\n")

        # Attempt to release everything allocated in this iteration
        del query
        del encodeds
        del model_inputs
        del generated_ids
        del decoded
        del result
        del array
        del ground_truth
        del train_input_grids
        del train_output_grids
        del test_input_grids
        del test_output_grids
        del train_tasks
        del test_tasks
        del instruction
        del file_path
        del data
        gc.collect()
        torch.cuda.empty_cache()
        gc.collect()
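This is not part of the script above, just a sketch of how the growth could be confirmed per iteration: printing the allocator statistics at the end of each loop (e.g. right after the final torch.cuda.empty_cache()) would show whether allocated/reserved memory returns to a baseline or keeps climbing.

import torch

def report_vram(tag):
    # torch.cuda.memory_allocated(): memory held by live tensors
    # torch.cuda.memory_reserved(): memory cached by the CUDA allocator
    alloc = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated={alloc:.2f} GiB, reserved={reserved:.2f} GiB")

# e.g. report_vram(name) at the end of each iteration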