I am new to AI. After watching many tutorials, I am trying to build an AI chatbot. I am using sentence-transformers/all-MiniLM-L6-v2
for embedding. The concept is straightforward: I have a CSV file where I’ve added all my questions and answers, and the AI should reply with the most similar answer from the CSV. For almost all questions, I am getting accurate answers, but I am facing a few issues. For example, when a user says ‘Hello’ or ‘Hi’, my AI gives a welcome message, but when someone types ‘anyone here’, it does not provide any answer, even though ‘anyone here’ is also in my CSV row. I am using the chatbot with my Django project. Here is the code.
models_loader.py
# model_loader.py
from sentence_transformers import SentenceTransformer
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import os
# Global variables to hold the model and data
embedding_model = None # This will hold the model
embeddings = None # This will hold the tensor of embeddings
chunk_texts = []
def load_all():
global embeddings, chunk_texts, embedding_model
# Load a different model for potentially better embeddings
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Define the path to your CSV file
base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
csv_file = os.path.join(base_dir, 'sample_support_data.csv')
try:
# Try loading with UTF-8 encoding first
data = pd.read_csv(csv_file, encoding='utf-8')
except UnicodeDecodeError:
try:
# If UTF-8 fails, try loading with ISO-8859-1
data = pd.read_csv(csv_file, encoding='ISO-8859-1')
except UnicodeDecodeError:
# As a last resort, try windows-1252
data = pd.read_csv(csv_file, encoding='windows-1252')
# Create a combined text entry for embeddings without preprocessing that removes useful content
chunk_texts = []
for index, row in data.iterrows():
question = row["Question/Issue"] if pd.notnull(row["Question/Issue"]) else ''
answer = row["Answer/Response"] if pd.notnull(row["Answer/Response"]) else ''
category = row["Category"] if pd.notnull(row["Category"]) else ''
combined_entry = f"Question/Issue: {question}\nAnswer/Response: {answer}\nCategory: {category}"
chunk_texts.append(combined_entry)
# Generate embeddings for the entire combined text
embeddings = embedding_model.encode(chunk_texts, convert_to_tensor=True)
def preprocess_text(text):
"""Preprocesses the text by removing stop words and punctuation."""
words = text.lower().split()
words = [word for word in words if word not in ENGLISH_STOP_WORDS]
return " ".join(re.sub(r'[^\w\s]', '', word) for word in words)
my views.py
@api_view(['POST'])
def answer_question(request):
if not request.data.get('question'):
return Response({"error": "Question not provided"}, status=400)
# Get the raw question and preprocess it
question = request.data['question'].strip().lower() # Convert to lowercase for matching
# Use the embedding model to encode the question
question_embedding = embedding_model.encode(preprocess_text(question), convert_to_tensor=True)
similarities = util.cos_sim(question_embedding, embeddings)[0]
top_indices = similarities.argsort(descending=True)[:2]
if similarities[top_indices[0]] < 0.35: # Adjust the threshold if needed
return Response({"error": "No relevant information found"}, status=404)
best_context = chunk_texts[top_indices[0]]
# Extracting the components from the best context
question_issue_match = re.search(r"Question/Issue:\s*(.*?)\nAnswer/Response:", best_context, re.DOTALL)
answer_response_match = re.search(r"Answer/Response:\s*(.*?)\nCategory:", best_context, re.DOTALL)
category_match = re.search(r"Category:\s*(.*)", best_context, re.DOTALL)
question_issue = question_issue_match.group(1).strip() if question_issue_match else "No specific question found"
answer_response = answer_response_match.group(1).strip() if answer_response_match else "No specific answer found"
category = category_match.group(1).strip() if category_match else "No specific category found"
# Structuring the response
structured_response = {
"Question": question_issue,
"Answer": answer_response,
"Category": category
}
return Response({"answer": structured_response}, status=200)
Also, for a few questions, I am not getting accurate answers. Can anyone please help me improve this to get more accurate answers?