Chatbot PDF - Only local

Hello, I'm a beginner in this area and I'm studying and implementing tasks that might help me one day. My problem is that I want to create a chatbot: I insert a PDF and, when I ask questions related to that PDF, it answers them. I found several videos and documents that use models to solve this kind of problem.

However, I ran into a few problems:

The first problem is the language: I need Brazilian Portuguese, and there are not many models focused on this language. The one that gave me the best results was Mistral 7B.

The second problem is that when I ask questions, even with the PDF context inserted, it answers with information found elsewhere (it seems to pull that information from the internet and prefer it over the context inserted from the PDF).

The third is that when I ask a question it shouldn't know the answer to, even though I tell it in the prompt to reply that it doesn't know, it hallucinates and makes up an answer.

So, in short: could someone help me create a chatbot that receives information from a PDF, takes a question in Portuguese, answers based only on the inserted context, and, if it doesn't know, simply replies that it doesn't know?

Below is the code for how I did the implementation with LangChain, Llama, Zephyr, and Mistral.

import re
import gc
import torch

from typing import Optional

from langchain.chains import ConversationalRetrievalChain
from langchain.docstore.document import Document
from langchain.document_loaders import DataFrameLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch
from pandas import DataFrame, read_csv

from app.configs import DBConfig, StorageConfig


class ChatModel:
    def __init__(self):
        self.documents = []
        self.embeddings = None
        self.llm = None
        self.memory = None
        self.retriever = None
        self.df = None

        self.device = "cpu"

    def clean_memory(self):
        gc.collect()

        if self.device == "cuda":
            torch.cuda.empty_cache()

    def load_device(self):
        if torch.cuda.is_available():
            self.device = "cuda"

        else:
            self.device = "cpu"
        try:
            if torch.backends.mps.is_available():
                self.device = "mps"
        except AttributeError:
            # torch.backends.mps does not exist on older PyTorch builds
            pass

    def load_pdf(self, file: str) -> int:
        loader = PyPDFLoader(file)
        self.documents = loader.load()

        return len(self.documents)

    def remove_single_newlines(self, text: Document):
        text.page_content = re.sub(r"\n(?! \n)", "", text.page_content)
        text.page_content = text.page_content.strip()
        return text

    def format_text(self):
        self.documents = [
            self.remove_single_newlines(doc) for doc in self.documents
        ]

    def check_duplicity(self):
        unique_documents = []

        for doc in self.documents:
            # Keep the chunk only if no similar chunk is already stored.
            is_unique = (
                len(self.similarity(query=doc.page_content, threshold=2.0, k=2)) == 0
            )
            if is_unique:
                unique_documents.append(doc)

        self.documents = unique_documents

    def load_csv(self, file: Optional[str] = None) -> int:
        if file:
            self.df = read_csv(file)

        loader = DataFrameLoader(self.df, page_content_column="Resposta")
        self.documents = loader.load()

        return len(self.documents)

    def split_pdf(self, chunk_size: int = 150, chunk_overlap: int = 1) -> int:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap, separators=["\n"]
        )

        self.documents = text_splitter.split_documents(self.documents)

        return len(self.documents)

    def split_csv(
        self, file: str, chunk_size: int = 1500, chunk_overlap: int = 150
    ) -> int:
        df = read_csv(file, sep=";")

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

        new_df = []

        for i in range(len(df)):
            text = df["Resposta"][i]

            split_text = text_splitter.split_text(text)
            for j in range(len(split_text)):
                new_df.append(
                    [
                        split_text[j],
                    ]
                )

        self.df = DataFrame(new_df, columns=["Resposta"])

        return len(self.df)

    def get_embeddings(
        self, model_name: str = "rufimelo/Legal-BERTimbau-sts-base-ma-v2"
    ) -> None:
        self.embeddings = HuggingFaceEmbeddings(
            model_name=model_name, model_kwargs={"device": self.device}
        )

    def store(self) -> None:
        try:
            vectordb = ElasticVectorSearch.from_documents(
                documents=self.documents,
                embedding=self.embeddings,
                elasticsearch_url=DBConfig.ELASTIC_URL,
                index_name="my_index",
            )

            self.vectordb = vectordb

            vectordb.client.indices.refresh(index="my_index")
        except Exception:
            self.vectordb = None
            raise

    def connect(self) -> None:
        try:
            vectordb = ElasticVectorSearch(
                elasticsearch_url=DBConfig.ELASTIC_URL,
                embedding=self.embeddings,
                index_name="my_index",
            )

            self.vectordb = vectordb

            vectordb.client.indices.refresh(index="my_index")
        except Exception:
            self.vectordb = None
            raise

    def create_llm(
        self,
        temperature: float = 0.2,
        n_ctx: int = 2048,
        n_batch: int = 8192,
        n_gpu_layers: int = 60,
        n_threads: int = 8,
    ) -> None:
        self.llm = LlamaCpp(
            model_path=StorageConfig.META_URL,
            temperature=temperature,
            verbose=False,
            callbacks=[],
            n_ctx=n_ctx,
            n_batch=n_batch,
            n_gpu_layers=n_gpu_layers,
            n_threads=n_threads,
        )

    def create_memory(self) -> None:
        self.memory = ConversationBufferMemory(
            memory_key="chat_history", return_messages=True
        )

    def create_retriever(self) -> None:
        self.retriever = self.vectordb.as_retriever(search_kwargs={"k": 2})

    def create_chat_session(self) -> None:
        PROMPT_TEMPLATE = """
          ### Instrução:
          Use os seguintes contextos para responder à pergunta no final. Se a resposta não estiver no contexto, apenas diga que não sabe, não invente uma resposta. Responda somente à pergunta. Mantenha a resposta o mais concisa possível.
          {context}
          Pergunta: {question}
          ### Resposta:
        """

        QA_CHAIN_PROMPT = PromptTemplate(
            template=PROMPT_TEMPLATE, input_variables=["question", "context"]
        )

        self.qa = ConversationalRetrievalChain.from_llm(
            self.llm,
            retriever=self.retriever,
            memory=self.memory,
            verbose=True,
            combine_docs_chain_kwargs={"prompt": QA_CHAIN_PROMPT},
        )

    def delete_chat_session(self) -> None:
        self.qa = None

    def similarity(self, query: str, threshold: float, k: int = 2) -> list:
        results = self.vectordb.similarity_search_with_score(
            query=query,
            k=k,
        )

        docs = [
            {
                "page": doc.metadata["page"],
                "file": doc.metadata["source"],
                "score": score,
                "context": doc.page_content,
            }
            for doc, score in results
            if score >= threshold
        ]

        return docs

# Service module that uses the ChatModel class above.
from app.models import ChatModel
from app.configs import ColorConfig, IAConfig
from app.utils import format_answer


def message_answer_service(data) -> str:
    print(f"👤 {ColorConfig.BOLD}Pergunta: {ColorConfig.ENDC}{data['question']}")

    chat = ChatModel()

    try:
        chat.load_device()

        chat.clean_memory()

        chat.get_embeddings()

        try:
            chat.connect()
        except Exception:
            return "Não possui dados no contexto!"

        chat.create_llm(
            temperature=0.3,
            n_ctx=IAConfig.N_CTX,
            n_batch=IAConfig.N_BATCH,
            n_gpu_layers=IAConfig.N_GPU_LAYERS,
            n_threads=IAConfig.N_THREADS,
        )

        chat.create_memory()

        chat.create_retriever()

        chat.create_chat_session()

        result = chat.qa({"question": data["question"], "chat_history": []})

        result = format_answer(result["answer"])

        print(f"🤖 {ColorConfig.BOLD}Resposta: {ColorConfig.ENDC}{result}")

        chat.clean_memory()

        chat.delete_chat_session()

        del chat

        return result

    except Exception as error:
        print("An error occurred:", type(error).__name__, "–", error)

        del chat

        return "Ocorreu um erro ao procurar a resposta!"

Hi there,

I've translated your text into English so that I could read it; there's a chance the translation is off in places, but hopefully my answer is useful anyway.

One of the biggest issues with LLMs is that they are designed to produce the next most likely token, and they do this best for the languages they have been trained on the most. Because most models are trained largely on English corpora, they produce their best results from English-language input. Even though you're telling the LLM not to produce an answer when it doesn't know, it likely can't follow that instruction reliably, because the instruction is not in English and the query-key-value interactions it learned for instruction following are weaker outside English.
This may also be why the LLM appears to be giving you answers found on the internet: there's a strong chance that the Portuguese portion of its training corpora was simply whatever resources were freely available online, and it's unlikely the model underwent any reinforcement learning for that specific language.

All this is to say that it's best to translate everything to English before passing it to the LLM, and then translate the answer back to Portuguese before returning the response. I haven't delved into LangChain's translation components, but there are enough translation tools available that you can achieve this with only a few adaptations to what you have so far.
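
For illustration, here is a minimal sketch of that idea wrapped around your existing chain. The translation model names (Helsinki-NLP/opus-mt-ROMANCE-en and Helsinki-NLP/opus-mt-en-ROMANCE) and the >>pt_br<< target-language token are assumptions based on the Opus-MT family; check the model cards and swap in whichever pt/en models work best for you. The ask_in_english helper is hypothetical, not part of LangChain.

from transformers import pipeline

# Assumed Opus-MT models; any pt->en and en->pt translation models would do.
pt_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-ROMANCE-en")
en_to_pt = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ROMANCE")


def ask_in_english(chat, question_pt: str) -> str:
    # Translate the Portuguese question to English before querying the chain.
    question_en = pt_to_en(question_pt)[0]["translation_text"]

    result = chat.qa({"question": question_en, "chat_history": []})

    # Multi-target Opus-MT models expect a language-token prefix (assumed here
    # to be >>pt_br<<; the exact token is listed on the model card).
    return en_to_pt(">>pt_br<< " + result["answer"])[0]["translation_text"]

Note that for this to work end to end, the retrieved context should ideally be in English as well, which means translating the documents before indexing them (or translating the retrieved chunks in a custom combine-docs step); otherwise the LLM still sees a mixed-language prompt.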

Hopefully this has been helpful to you. Good luck!
