regisss,
Here is the code. To run it with the CLI, change `cliBool = False` to `True` and adjust the location path `model_save`. The size comparison is then done manually by checking the sizes of the files created at `model_save` (a quick sketch for that is at the end, after the code).
The ultimate objective is not Dolly but something like a Flan-T5 XXX. I am exploring the feasibility of using ORT. I attempted this earlier with OpenVINO, but their implementation crashed; it is now being evaluated by Intel developers, after they confirmed the issue with their own sample code.
```python
#!/usr/bin/env python
# standard imports
import os
import subprocess
from pathlib import Path
# third-party imports
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM
from optimum.pipelines import pipeline
import torch
#from optimum.onnxruntime import OptimizationConfig, ORTOptimizer  # deferred to the optimization step
cliBool = False
# getting from huggingface.co
model_remote = "databricks/dolly-v2-3b"
model_path = Path("~/.cache/huggingface/hub/models--databricks--dolly-v2-3b").expanduser()
if model_path.exists():
    print("************* 1: Getting local PyTorch Model")
    # ... from the local cache; the export process does not expand the user path; also, it needs to point at the config.json file location
    model_path = model_path / "snapshots/f6c9be08f16fe4d3a719bee0a4a7c7415b5c65df"
    # saving to: /home/jellybean/.cache/huggingface/hub/models--databricks--dolly-v2-3b/snapshots/f6c9be08f16fe4d3a719bee0a4a7c7415b5c65df/optimum/onnx_rt
    model_save = model_path / "optimum/onnx_rt_optimized"
else:
    print("************* 1: Getting Hugging Face PyTorch Model")
    model_path = model_remote
    # manually move it to the cache snapshot once created
    model_save = Path("~/.cache/huggingface/hub/").expanduser() / "optimum/onnx_rt"
# GPU does not have enough memory; the following reports memory stats before the ONNX conversion starts
#mem = torch.cuda.mem_get_info()
#print(f"CUDA Memory - Available: {mem[0]} Total: {mem[1]}")
#print(f"CUDA Memory - Total: {torch.cuda.get_device_properties(0).total_memory} Reserved: {torch.cuda.memory_reserved(0)} Allocated: {torch.cuda.memory_allocated(0)}")
#print(f"{torch.cuda.memory_summary(device=0, abbreviated=True)}")
# check whether the file exists so we don't convert again
config = model_save / "config.json"
"""torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 76.00 MiB...
... (GPU 0; 5.79 GiB total capacity; 5.26 GiB already allocated; 69.50 MiB free;...
... 5.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory...
... try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF...
... but os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:21" still gets OutOfMemoryError
cli = f"optimum-cli export onnx -m {model_path} --cache_dir {model_save} --task text-generation "
cli += f"--framework pt --device cuda --optimize O4 --batch_size 32 {model_save}"
Cause: --device cuda --optimize O4
"""
if not config.exists():
    if cliBool:
        print("************* 2: CLI-ONNX Convert/Optimize")
        # switching to CPU; need to specify --task if not from huggingface; optimizing to Level O3...
        # ... export dolly to onnx: > optimum-cli export onnx --help
        cli = f"optimum-cli export onnx -m {model_path} --task text-generation-with-past --framework pt --no-post-process --optimize O3 {model_save}"
        with subprocess.Popen([cli], shell=True, stdout=subprocess.PIPE, stdin=subprocess.PIPE) as proc:
            _, _ = proc.communicate()
    else:
        print("************* 2: Programmatic-ONNX Convert/Optimize")
        from optimum.onnxruntime import AutoOptimizationConfig, ORTOptimizer
        print("************* 2.a")
        model = ORTModelForCausalLM.from_pretrained(model_path, export=True)
        print("************* 2.b")
        optimizer = ORTOptimizer.from_pretrained(model)
        print("************* 2.c")
        # optimizing to Level O3
        optimization_config = AutoOptimizationConfig.O3()
        print(optimization_config)
        raise SystemExit  # note: execution stops here, so the optimize() call below never runs
        print("************* 2.d")
        optimizer.optimize(save_dir=model_save, optimization_config=optimization_config)
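        # Note (my assumption): optimize() writes the optimized ONNX model and config to
        # model_save, but I don't believe the tokenizer files are copied there, so the
        # AutoTokenizer.from_pretrained(model_save) call in step 3 may fail on this path.
        # If so, saving the tokenizer next to the model should cover it, e.g.:
        #   AutoTokenizer.from_pretrained(model_path).save_pretrained(model_save)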
else: print("************* 2: Skipping ONNX Conversion/Optimization")
# inference using the PyTorch-to-ONNX converted model
print("************* 3: ONNX Model Inference")
tokenizer = AutoTokenizer.from_pretrained(model_save)
model_ort = ORTModelForCausalLM.from_pretrained(model_save)
ort_pipe = pipeline("text-generation", model=model_ort, tokenizer=tokenizer, accelerator="ort", framework="pt", device=-1, model_kwargs={"load_in_8bit": True})
# try 4 different prompts
prompt1 = ["Explain to me what is love.", "You are an idiot"]  # produces a list of two lists with dicts
prompt2 = ["Explain to me what is hate."]  # produces a list of one list with a dict
prompt3 = "Do you love yourself?."  # produces a list with a dict
prompt4 = "I hate you!"  # produces a list with a dict
prompts = [prompt1, prompt2, prompt3, prompt4]
for prompt in prompts:
    res = ort_pipe(prompt)
    print(f"Prompt: {prompt}")
    for item in res:
        next_item = item
        while isinstance(next_item, list): next_item = next_item[0]  # unwrap nested lists until we reach the dict
        print(f"\n{next_item['generated_text']}")
    else: pass
else: pass
```
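
For the manual size comparison mentioned at the top, something along these lines should do: total the bytes under the original snapshot and under `model_save` (untested sketch; the paths are the ones used in the script above, the helper itself is just for illustration):

```python
from pathlib import Path

def dir_size_mib(path: Path) -> float:
    """Total size of all files under a directory, in MiB."""
    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file()) / 2**20

snapshot = Path(
    "~/.cache/huggingface/hub/models--databricks--dolly-v2-3b/"
    "snapshots/f6c9be08f16fe4d3a719bee0a4a7c7415b5c65df"
).expanduser()
print(f"PyTorch snapshot : {dir_size_mib(snapshot):,.1f} MiB")
print(f"ONNX (optimized) : {dir_size_mib(snapshot / 'optimum/onnx_rt_optimized'):,.1f} MiB")
```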