CUDA out of memory while doing inference in a loop

I’m doing inference (not training) over a loop of puzzle queries. Even on an A6000 with 48 GB of VRAM I run out of CUDA memory after about 10 loops (I need to do 400). It seems that VRAM is not being released at the end of each loop. I’ve tried model.eval(), with torch.no_grad():, gc.collect(), and torch.cuda.empty_cache(), with no luck. I don’t think setting max_split_size_mb will help, since VRAM usage just increases with each loop. Has anyone experienced this? Any ideas what to do?

OutOfMemoryError: CUDA out of memory. Tried to allocate 33.94 GiB (GPU 0; 47.54 GiB total capacity; 22.99 GiB already allocated; 23.78 GiB free; 23.37 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
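
To see whether allocated memory really climbs across iterations, a minimal check along these lines can be dropped into the loop (a sketch only; it assumes torch is imported and a CUDA device is active, and log_cuda_memory is just an illustrative helper name):

import torch

def log_cuda_memory(tag):
    # memory_allocated() counts memory held by live tensors;
    # memory_reserved() counts memory held by PyTorch's caching allocator.
    allocated = torch.cuda.memory_allocated() / 1024**3
    reserved = torch.cuda.memory_reserved() / 1024**3
    print(f"[{tag}] allocated {allocated:.2f} GiB | reserved {reserved:.2f} GiB")

Calling it before and after each generate() call shows whether "allocated" (not just "reserved") grows every iteration, which would mean a tensor from the previous loop is still referenced somewhere.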

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

# gdrive_path, bnb_config, and download_base_tokenizer are defined earlier (not shown here).
peft_model_id = gdrive_path + "../adapters/Mistral-7B-Instruct_finetuned_on_20000_core_puzzles"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,
                                                    quantization_config=bnb_config,
                                                    return_dict=True,
                                                    load_in_4bit=True,
                                                    device_map={"":0})

if download_base_tokenizer:
    base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
else:
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

tokenizer.pad_token = tokenizer.eos_token

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)


import json
import os
import re
import ast
import gc

directory = "../data/ARC/evaluation/"
total_files = 0
num_correct = 0
for root, dirs, files in os.walk(directory):
  for name in files:
    torch.cuda.empty_cache()
    gc.collect()

    file_path = os.path.join(root, name)

    with open(file_path, "r") as f:
        data = json.load(f)
    train_tasks = data["train"]
    test_tasks = data["test"]

    train_input_grids = []
    train_output_grids = []
    test_input_grids = []
    test_output_grids = []

    for train_task in train_tasks:
        ti = train_task["input"]
        train_input_grids.append(ti)

        to = train_task["output"]
        train_output_grids.append(to)

    for test_task in test_tasks:
        ti = test_task["input"]
        test_input_grids.append(ti)

        to = test_task["output"]
        test_output_grids.append(to)

    instruction = create_instruction(
        train_input_grids, train_output_grids, test_input_grids
    )

    query = '<s>[INST] ' + instruction + ' [/INST] '
    encodeds = tokenizer(query, return_tensors="pt", add_special_tokens=False)
    model_inputs = encodeds.to("cuda")

    model.eval()
    with torch.no_grad():
        generated_ids = model.generate(**model_inputs, max_new_tokens=4096, do_sample=False, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.batch_decode(generated_ids)
    result = decoded[0]
    result = result[len(query) :]

    match = re.search(r"Test_Output_1=\[\[.*?\]\]", result)
    if match:
        array_string = match.group(0).replace("Test_Output_1=", "")
        array = ast.literal_eval(array_string)
        print("Result")
        print(result)
        print("")
    else:
        print("No match found")
        print(result)
        print("")
        array = []

    ground_truth = test_output_grids[0]

    print("Ground Truth")
    for row in ground_truth:
        print(row)
    print("Prediction" )
    for row in array:
        print(row)
    
    if array == ground_truth:
        print("...................................Correct!!!!!!!!!!!!!\n")
        num_correct += 1
    else:
        print("...................................Incorrect.\n")

    del query
    del encodeds
    del model_inputs
    del generated_ids
    del decoded
    del result
    del array
    del ground_truth
    del train_input_grids
    del train_output_grids
    del test_input_grids
    del test_output_grids
    del train_tasks
    del test_tasks
    del instruction
    del file_path
    del data
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()
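
Would something along these lines help? This is just a sketch of the generation step with torch.inference_mode() instead of torch.no_grad(), and with the output copied to the CPU before decoding (it assumes the same model, tokenizer, and model_inputs as above):

model.eval()
with torch.inference_mode():
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=4096,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
# Copy the result off the GPU and drop the CUDA tensors right away,
# so nothing from this iteration keeps VRAM alive into the next one.
generated_ids_cpu = generated_ids.cpu()
del generated_ids, model_inputs
torch.cuda.empty_cache()
decoded = tokenizer.batch_decode(generated_ids_cpu)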

I have the same problem: it runs fine for the first few hundred inferences but then hits CUDA out of memory somewhere in the middle. Memory is definitely not being released, but it is very hard to check. Llama-7B does not have this issue.
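
One way to check (a sketch only; it walks the Python garbage collector looking for live CUDA tensors, which is slow but fine for debugging, and live_cuda_tensors is just an illustrative name):

import gc
import torch

def live_cuda_tensors():
    # Enumerate CUDA tensors that are still referenced anywhere in the process.
    tensors = []
    for obj in gc.get_objects():
        try:
            if torch.is_tensor(obj) and obj.is_cuda:
                tensors.append((tuple(obj.shape), obj.dtype))
        except Exception:
            pass
    return tensors

print(len(live_cuda_tensors()), "CUDA tensors still alive")

If the count keeps rising between inferences, something is holding references to the outputs.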

Facing the same issue, have you found a solution?

I have not.

Facing the same issue, did anyone resolve this?

I was able to resolve this; you can try this answer: gpu - Deploying LLM on Sagemaker Endpoint - CUDA out of Memory - Stack Overflow