Getting an error on query_engine for llm_completion_callback

I am implementing RAG with a Hugging Face model on LlamaIndex 0.10, and I am hitting an error when running the query engine.

import torch

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.5, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16},
)

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)
from llama_index.core import Settings

from llama_index.core.node_parser import SentenceSplitter

Settings.llm = llm

Settings.embed_model = embed_model

Settings.node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
index = VectorStoreIndex.from_documents(documents, show_progress=True)

query_engine = index.as_query_engine()

##########Error In Execution##################
response = query_engine.query("what is a wide ball")
##########################################


ValueError                                Traceback (most recent call last)
in <cell line: 2>()
      1 query_engine = index.as_query_engine()
----> 2 response = query_engine.query("what is a wide ball")
      3 # response

15 frames
/usr/local/lib/python3.10/dist-packages/llama_index/legacy/llms/base.py in wrapper_logic(_self)
    161     callback_manager = getattr(_self, "callback_manager", None)
    162     if not isinstance(callback_manager, CallbackManager):
--> 163         raise ValueError(
    164             "Cannot use llm_completion_callback on an instance "
    165             "without a callback_manager attribute."

ValueError: Cannot use llm_completion_callback on an instance without a callback_manager attribute.

Desperately looking for a solution
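One quick way to narrow this down is to check which module the HuggingFaceLLM class was actually imported from; the traceback above already points at llama_index/legacy/llms/base.py. A minimal diagnostic sketch:

# Diagnostic only: print where the class comes from.
# A module path containing "legacy" means the pre-0.10 class is still being used
# and needs to be re-imported from llama_index.llms.huggingface (see Resolution below).
print(HuggingFaceLLM.__module__)
print(type(llm).__module__)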

Resolution

The traceback goes through llama_index/legacy/llms/base.py, which means the HuggingFaceLLM in use comes from the legacy (pre-0.10) code path rather than the new integration package. Installing llama-index-llms-huggingface and importing HuggingFaceLLM from llama_index.llms.huggingface resolves the error.

pip install llama-index-llms-huggingface
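The rest of the snippet also needs its own 0.10 integration packages: the Langchain embedding wrapper, bitsandbytes for the 8-bit quantization config, and accelerate for device_map="auto". The package names below assume the standard 0.10 split-package layout:

pip install llama-index-embeddings-langchain langchain-community sentence-transformers
pip install bitsandbytes accelerate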
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, PromptTemplate, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.huggingface import HuggingFaceLLM
import torch
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_8bit_compute_dtype=torch.float16,
    # bnb_8bit_quant_type="nf8",
    # bnb_8bit_use_double_quant=True,
)
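The HuggingFaceLLM below references system_prompt and query_wrapper_prompt, which are defined elsewhere in the notebook. For completeness, here is a minimal sketch of what they might look like for a Llama-2 chat model (the prompt text itself is an assumption; adjust it to your use case):

# Hypothetical prompt definitions; the original notebook defines these elsewhere.
system_prompt = "You are a helpful assistant. Answer using only the provided context."

# Llama-2 chat models expect the [INST] ... [/INST] wrapper around the user query.
query_wrapper_prompt = PromptTemplate("[INST] {query_str} [/INST]")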

llm = HuggingFaceLLM(
    context_window=4096,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.5, "do_sample": False},
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    model_name="meta-llama/Llama-2-7b-chat-hf",
    model_kwargs={"torch_dtype": torch.float16, "quantization_config": quantization_config},
    device_map="auto",
)
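The code below reuses embed_model and documents from the original question; for a self-contained script they have to be defined before the index is built. A sketch, assuming the Langchain embedding wrapper from above and a local data/ directory (the path is a placeholder):

# Assumes llama-index-embeddings-langchain and langchain-community are installed (see above).
from llama_index.embeddings.langchain import LangchainEmbedding
from langchain_community.embeddings import HuggingFaceEmbeddings

embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
)

# "data" is a placeholder; point SimpleDirectoryReader at your own document folder.
documents = SimpleDirectoryReader("data").load_data()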

Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

index = VectorStoreIndex.from_documents(documents, show_progress=True)

query_engine = index.as_query_engine(llm=llm, streaming=True)
response = query_engine.query("Your query here")

print(response)
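Note that with streaming=True the query engine returns a streaming response rather than a plain Response object. If you want tokens written out as they are generated, consume the stream explicitly; otherwise you can drop streaming=True and print the response as above. A small sketch:

# Stream tokens to stdout as they arrive instead of printing the finished answer.
streaming_response = query_engine.query("Your query here")
streaming_response.print_response_stream()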