Hi,
I'm new to this area. I want to build a private chat over customer PDF documents, with responses similar to ChatGPT's. I built the code below, but the response is a one-line statement, not a ChatGPT-style answer. I have tried various ways but failed to get the desired result.
This is my code:
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.vectorstores import FAISS  # updated import
from langchain_community.document_loaders import PyPDFLoader  # updated import
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline  # updated imports

# Paths
DATA_DIR = "./trainingdata"
VECTOR_STORE_PATH = "./vector_store"
MODEL_NAME = "E:/Workings/Python/chatgt-v3-opelmodal/custom_model_dir/models--google--flan-t5-large"
# Load the PDFs and split them into smaller chunks
def load_and_process_documents(data_dir):
    documents = []
    for file in os.listdir(data_dir):
        if file.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(data_dir, file))
            documents.extend(loader.load())
    # Split documents into smaller chunks
    splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=100)
    return splitter.split_documents(documents)
# Create or load the vector store
def create_or_load_vectorstore(docs):
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    if os.path.exists(VECTOR_STORE_PATH):
        print("Loading existing vectorstore...")
        vectorstore = FAISS.load_local(VECTOR_STORE_PATH, embedding_model, allow_dangerous_deserialization=True)
    else:
        print("Creating new vectorstore...")
        vectorstore = FAISS.from_documents(docs, embedding_model)
        vectorstore.save_local(VECTOR_STORE_PATH)
    return vectorstore
# Initialize the generative language model pipeline
def load_llm_pipeline():
    print("Loading generative language model...")
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)
    print("Loading model...")
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, local_files_only=True)
    hf_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
    return HuggingFacePipeline(pipeline=hf_pipeline)
def retrieve_context(query, retriever):
    """
    Retrieve relevant chunks for a given query using the retriever.
    """
    retrieved_docs = retriever.invoke(query)  # use the updated invoke() method
    combined_context = " ".join([doc.page_content for doc in retrieved_docs[:3]])  # top 3 chunks
    print("Retrieved Context is", combined_context)  # debug output shown in the sample run below
    return combined_context
# Summarization helper
def summarize_context(context, gen_pipeline, max_length=512):
    """
    Summarize the context if it exceeds the model's input token limit.
    """
    if len(context.split()) > max_length:
        summarizer_prompt = f"Summarize the following context to fit within {max_length} tokens:\n\n{context}"
        try:
            # HuggingFacePipeline.invoke() returns the generated text as a plain string
            return gen_pipeline.invoke(summarizer_prompt)
        except Exception as e:
            print("Error during summarization:", str(e))
            return "The context is too long to summarize effectively."
    return context
def generate_response(query, retriever, gen_pipeline):
    """
    Generate a response using the retrieved context.
    """
    # Retrieve context
    context = retrieve_context(query, retriever)
    # Summarize context if too long
    max_context_length = 512
    context = summarize_context(context, gen_pipeline, max_length=max_context_length)
    # Prompt engineering
    prompt_template = """
You are an intelligent assistant. Based on the company documents, generate a detailed and precise response to the user's question.
Context: {context}
Question: {question}
Answer:
"""
    prompt = prompt_template.format(context=context, question=query)
    # Generate response
    try:
        # HuggingFacePipeline.invoke() returns the generated text as a plain string
        response = gen_pipeline.invoke(prompt)
    except Exception as e:
        print("Error generating response:", str(e))
        response = "I'm sorry, I couldn't generate a response."
    return response
def main():
    # Load and process data
    print("Loading and processing documents...")
    documents = load_and_process_documents(DATA_DIR)
    # Create or load vector store
    vectorstore = create_or_load_vectorstore(documents)
    retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})
    # Load generative model
    gen_pipeline = load_llm_pipeline()
    print("Chatbot is ready. Ask questions (type 'exit' to quit).")
    while True:
        query = input("\nEnter your question: ")
        if query.lower() == "exit":
            break
        try:
            print("\nRetrieving relevant context...")
            # Generate response using the retriever
            response = generate_response(query, retriever, gen_pipeline)
            print("\nAnswer:", response)
        except Exception as e:
            print("Error:", e)

if __name__ == "__main__":
    main()
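One of the other things I tried was LangChain's RetrievalQA chain instead of my own generate_response; the wiring looked roughly like this (a sketch from memory, so details may be off), and the answers came out just as short:

from langchain.chains import RetrievalQA

# Earlier attempt with the built-in QA chain; "stuff" concatenates the
# retrieved chunks directly into the prompt for the LLM.
qa_chain = RetrievalQA.from_chain_type(
    llm=gen_pipeline,   # the HuggingFacePipeline returned by load_llm_pipeline()
    chain_type="stuff",
    retriever=retriever,
)
result = qa_chain.invoke({"query": "intervening holidays count?"})
print(result["result"])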
Sample run:
Enter your question: intervening holidays count?
Retrieving relevant context...
Retrieved Context is:
1. Purpose
The objective is to provide information to all the employees about the leaves and holidays followed in UPL India. Employees need adequate time to celebrate festival holidays, rest and recuperate and spend quality time with family and friends. This policy is effective from 1st October 2020.
2. Scope
All permanent employees, trainees, will be covered by this leave policy except the manufacturing employees in "worker" category with specific employment conditions. All new joiners will be covered from the date of joining.
3. Process Description
For all leave calculations, calendar year will be from 1 January to 31 December. New joiners leave eligibilities will be prorated till December 31st, from the date of joining. Leaves will be calculated on working days only and will not include intervening weekly offs and holidays.
4. Types of Holidays & Leaves:
- National & Festival Holidays
- Privilege Leave
- Casual Leave
- Sick Leave
- Maternity Leave
- Paternity Leave
4.1 National and Festival Holidays:
- All offices / plants / labs / field stations etc., will observe 3 national holidays viz: 26th January, 15th August & 2nd October.
- UPL will follow the statutory holidays including the above three national holidays per calendar year. The list of festival holidays will be in line with the local festivals of the respective state / location and may vary from state to state depending on the culture and other social sentiments.
- The actual holiday list for the following year will be published annually by December 15th of the previous calendar year.
Answer: Leaves will be calculated on working days only and will not include intervening weekly offs and holidays.
Here the system retrieves context relevant to the question, but the answer is just a sentence extracted from that context, not generative text. My PDF files are employee leave policies and HR policies.
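For completeness, this is the kind of generation-settings variation I have been experimenting with in load_llm_pipeline() to push the model past one-line answers (the parameter values below are guesses on my part, not settings I know to be correct):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, local_files_only=True)
# min_new_tokens forces a floor on the output length; sampling and a
# repetition penalty are meant to make the output less extractive.
# These values are experiments, not known-good settings.
hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    min_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.2,
)

Even with variations like this, the answer stays close to a single sentence copied from the context. Any pointers on getting longer, ChatGPT-like answers would be appreciated.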