How can I improve my AI bot to get more accurate answers?

I am new to AI. After watching many tutorials, I am trying to build an AI chatbot. I am using sentence-transformers/all-MiniLM-L6-v2 for embeddings. The concept is straightforward: I have a CSV file to which I've added all my questions and answers, and the AI should reply with the most similar answer from the CSV. For almost all questions I am getting accurate answers, but I am facing a few issues. For example, when a user says "Hello" or "Hi", my AI gives a welcome message, but when someone types "anyone here", it does not provide any answer, even though "anyone here" is also one of my CSV rows. I am using the chatbot within my Django project. Here is the code.

model_loader.py


# model_loader.py
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import os

# Module-level globals: populated once by load_all() at startup and read by
# the Django view layer on every request.
embedding_model = None  # SentenceTransformer instance (set in load_all)
embeddings = None       # Tensor of per-row embeddings, aligned with chunk_texts
chunk_texts = []        # Combined "Question/Answer/Category" string per CSV row

def load_all():
    """Load the sentence-transformer model, read the support CSV, and
    pre-compute one embedding per Q/A row.

    Populates the module-level globals ``embedding_model``, ``chunk_texts``
    and ``embeddings``.  Intended to be called once at startup.
    """
    global embeddings, chunk_texts, embedding_model
    embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # The CSV lives one directory above this module.
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    csv_file = os.path.join(base_dir, 'sample_support_data.csv')

    # Try encodings from strictest to most permissive.  NOTE: ISO-8859-1
    # maps every possible byte, so decoding with it can never raise
    # UnicodeDecodeError -- it must therefore be the LAST fallback; in the
    # original ordering the windows-1252 branch was unreachable dead code.
    for encoding in ('utf-8', 'windows-1252', 'ISO-8859-1'):
        try:
            data = pd.read_csv(csv_file, encoding=encoding)
            break
        except UnicodeDecodeError:
            continue

    # Replace NaN with '' in one pass instead of per-cell pd.notnull checks.
    data = data.fillna('')

    # One combined "document" per CSV row.  The view layer re-parses these
    # labels with regexes, so the field names and newlines must not change.
    chunk_texts = [
        f"Question/Issue: {row['Question/Issue']}\n"
        f"Answer/Response: {row['Answer/Response']}\n"
        f"Category: {row['Category']}"
        for _, row in data.iterrows()
    ]

    # Embed all rows in a single batch; kept as a tensor for util.cos_sim.
    embeddings = embedding_model.encode(chunk_texts, convert_to_tensor=True)

def preprocess_text(text):
    """Lowercase *text*, strip punctuation, and drop English stop words.

    Two fixes over the naive version:
    - Punctuation is removed BEFORE the stop-word check, so tokens like
      "hello!" are still recognized as stop words.
    - If every token is a stop word (e.g. "anyone here"), the cleaned
      original text is returned instead of an empty string.  An empty
      query embeds to a vector that matches nothing, which is why short
      conversational inputs were silently failing.
    """
    cleaned = [re.sub(r'[^\w\s]', '', word) for word in text.lower().split()]
    cleaned = [word for word in cleaned if word]  # drop punctuation-only tokens
    filtered = [word for word in cleaned if word not in ENGLISH_STOP_WORDS]
    # Fall back to the full cleaned text rather than returning "".
    return " ".join(filtered if filtered else cleaned)

views.py


@api_view(['POST'])
def answer_question(request):
    """POST endpoint: return the CSV row most similar to the user's question.

    Expects JSON ``{"question": "..."}`` and responds with the best-matching
    Question/Answer/Category triple, or 404 when no row clears the
    similarity threshold.
    """
    if not request.data.get('question'):
        return Response({"error": "Question not provided"}, status=400)

    question = request.data['question'].strip().lower()

    # Embed the RAW question.  The stored rows were embedded WITHOUT any
    # preprocessing in load_all(), so the query must use the same
    # representation.  Running preprocess_text() here strips stop words and
    # turns short inputs such as "anyone here" into an empty string, which
    # is exactly why those queries never matched anything.
    question_embedding = embedding_model.encode(question, convert_to_tensor=True)
    similarities = util.cos_sim(question_embedding, embeddings)[0]
    top_indices = similarities.argsort(descending=True)[:2]

    # Reject weak matches instead of returning an unrelated answer.
    if similarities[top_indices[0]] < 0.35:  # Adjust the threshold if needed
        return Response({"error": "No relevant information found"}, status=404)

    best_context = chunk_texts[top_indices[0]]

    # Re-parse the labelled fields exactly as load_all() formatted them.
    question_issue_match = re.search(r"Question/Issue:\s*(.*?)\nAnswer/Response:", best_context, re.DOTALL)
    answer_response_match = re.search(r"Answer/Response:\s*(.*?)\nCategory:", best_context, re.DOTALL)
    category_match = re.search(r"Category:\s*(.*)", best_context, re.DOTALL)

    question_issue = question_issue_match.group(1).strip() if question_issue_match else "No specific question found"
    answer_response = answer_response_match.group(1).strip() if answer_response_match else "No specific answer found"
    category = category_match.group(1).strip() if category_match else "No specific category found"

    # Structuring the response
    structured_response = {
        "Question": question_issue,
        "Answer": answer_response,
        "Category": category,
    }

    return Response({"answer": structured_response}, status=200)

Also, for a few questions, I am not getting accurate answers. Can anyone please help me improve this to get more accurate answers?

1 Like